Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
1227e00f
Commit
1227e00f
authored
Jun 25, 2013
by
yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix moments
parent
dbdeff20
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
167 additions
and
174 deletions
+167
-174
moments.cpp
modules/ocl/src/moments.cpp
+19
-24
moments.cl
modules/ocl/src/opencl/moments.cl
+144
-146
test_moments.cpp
modules/ocl/test/test_moments.cpp
+4
-4
No files found.
modules/ocl/src/moments.cpp
View file @
1227e00f
...
...
@@ -16,7 +16,7 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Sen Liu, s
en@multicorewareinc
.com
// Sen Liu, s
wjtuls1987@126
.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
...
...
@@ -277,8 +277,8 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
blocky
=
size
.
height
/
TILE_SIZE
;
else
blocky
=
size
.
height
/
TILE_SIZE
+
1
;
cv
::
ocl
::
oclMat
dst_m
(
blocky
*
10
,
blockx
,
CV_64FC1
);
cl_mem
sum
=
openCLCreateBuffer
(
src
.
clCxt
,
CL_MEM_READ_WRITE
,
10
*
sizeof
(
double
)
);
oclMat
dst_m
(
blocky
*
10
,
blockx
,
CV_64FC1
);
oclMat
sum
(
1
,
10
,
CV_64FC1
);
int
tile_width
=
std
::
min
(
size
.
width
,
TILE_SIZE
);
int
tile_height
=
std
::
min
(
size
.
height
,
TILE_SIZE
);
size_t
localThreads
[
3
]
=
{
tile_height
,
1
,
1
};
...
...
@@ -288,19 +288,16 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tileSize
.
width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tileSize
.
height
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
blocky
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
type
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
depth
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
cn
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
coi
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
binary
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
TILE_SIZE
));
openCLExecuteKernel
(
dst_m
.
clCxt
,
&
moments
,
"CvMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
);
openCLExecuteKernel
(
Context
::
getContext
()
,
&
moments
,
"CvMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
);
size_t
localThreadss
[
3
]
=
{
128
,
1
,
1
};
size_t
globalThreadss
[
3
]
=
{
128
,
1
,
1
};
...
...
@@ -309,25 +306,23 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tile_height
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tile_width
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
TILE_SIZE
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
step
));
openCLExecuteKernel
(
dst_m
.
clCxt
,
&
moments
,
"dst_sum"
,
globalThreadss
,
localThreadss
,
args_sum
,
-
1
,
-
1
);
double
*
dstsum
=
new
double
[
10
];
memset
(
dstsum
,
0
,
10
*
sizeof
(
double
));
openCLReadBuffer
(
dst_m
.
clCxt
,
sum
,(
void
*
)
dstsum
,
10
*
sizeof
(
double
));
mom
->
m00
=
dstsum
[
0
];
mom
->
m10
=
dstsum
[
1
];
mom
->
m01
=
dstsum
[
2
];
mom
->
m20
=
dstsum
[
3
];
mom
->
m11
=
dstsum
[
4
];
mom
->
m02
=
dstsum
[
5
];
mom
->
m30
=
dstsum
[
6
];
mom
->
m21
=
dstsum
[
7
];
mom
->
m12
=
dstsum
[
8
];
mom
->
m03
=
dstsum
[
9
];
delete
[]
dstsum
;
openCLSafeCall
(
clReleaseMemObject
(
sum
));
openCLExecuteKernel
(
Context
::
getContext
(),
&
moments
,
"dst_sum"
,
globalThreadss
,
localThreadss
,
args_sum
,
-
1
,
-
1
);
Mat
dstsum
(
sum
);
mom
->
m00
=
dstsum
.
at
<
double
>
(
0
,
0
);
mom
->
m10
=
dstsum
.
at
<
double
>
(
0
,
1
);
mom
->
m01
=
dstsum
.
at
<
double
>
(
0
,
2
);
mom
->
m20
=
dstsum
.
at
<
double
>
(
0
,
3
);
mom
->
m11
=
dstsum
.
at
<
double
>
(
0
,
4
);
mom
->
m02
=
dstsum
.
at
<
double
>
(
0
,
5
);
mom
->
m30
=
dstsum
.
at
<
double
>
(
0
,
6
);
mom
->
m21
=
dstsum
.
at
<
double
>
(
0
,
7
);
mom
->
m12
=
dstsum
.
at
<
double
>
(
0
,
8
);
mom
->
m03
=
dstsum
.
at
<
double
>
(
0
,
9
);
icvCompleteMomentState
(
mom
);
}
...
...
modules/ocl/src/opencl/moments.cl
View file @
1227e00f
...
...
@@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt
sum[i]
=
dst_sum[i][0]
;
}
__kernel
void
CvMoments_D0
(
__global
uchar16*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__kernel
void
CvMoments_D0
(
__global
uchar16*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
int
TILE_SIZE
)
int
depth,
int
cn,
int
coi,
int
binary,
int
TILE_SIZE
)
{
uchar
tmp_coi[16]
; // get the coi data
uchar16
tmp[16]
;
...
...
@@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
int
x
=
wgidx*TILE_SIZE
; // vector length of uchar
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step,
TILE_SIZE
)
;
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-
x
)
;
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep; i++ )
*
((
__global
uchar*
)
src_data+
(
y+lidy
)
*src_step+x+i
)
=
0
;
if
(
coi
>
0
)
//channel
of
interest
for
(
int
i
=
0
; i < tileSize_width; i += VLEN_C)
{
for
(
int
j=0
; j<VLEN_C; j++)
tmp_coi[j]
=
*
((
__global
uchar*
)
src_data+
(
y+lidy
)
*src_step+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_C]
=
(
uchar16
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C]
=
*
(
src_data+
(
y+lidy
)
*src_step/VLEN_C+
(
x+i
)
/VLEN_C
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-
x
)
;
if
(
y+lidy
<
src_rows
)
{
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
uchar*
)
src_data+
(
y+lidy
)
*src_step+x+i
)
=
0
;
if
(
coi
>
0
)
//channel
of
interest
for
(
int
i
=
0
; i < tileSize_width; i += VLEN_C)
{
for
(
int
j=0
; j<VLEN_C; j++)
tmp_coi[j]
=
*
((
__global
uchar*
)
src_data+
(
y+lidy
)
*src_step+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_C]
=
(
uchar16
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C]
=
*
(
src_data+
(
y+lidy
)
*src_step/VLEN_C+
(
x+i
)
/VLEN_C
)
;
}
uchar16
zero
=
(
uchar16
)(
0
)
;
uchar16
full
=
(
uchar16
)(
255
)
;
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C]
=
(
tmp[i/VLEN_C]!=zero
)
?full:zero
;
F
mom[10]
;
__local
int
m[10][128]
;
if
(
lidy
==
0
)
if
(
lidy
<
128
)
{
for
(
int
i=0
; i<10; i++)
for
(
int
j=0
; j<128; j++)
m[i][j]=0
;
m[i][lidy]=0
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
lm[10]
=
{0}
;
int16
x0
=
(
int16
)(
0
)
;
int16
x1
=
(
int16
)(
0
)
;
...
...
@@ -281,6 +289,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
m[i][lidy-j/2]
=
lm[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
...
...
@@ -328,10 +337,10 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
}
}
__kernel
void
CvMoments_D2
(
__global
ushort8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__kernel
void
CvMoments_D2
(
__global
ushort8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
ushort
tmp_coi[8]
; // get the coi data
ushort8
tmp[32]
;
...
...
@@ -346,21 +355,26 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
int
x
=
wgidx*TILE_SIZE
; // real X index of pixel
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step/2,
TILE_SIZE
)
;
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
if
(
src_cols
>
TILE_SIZE
&&
tileSize_width
<
TILE_SIZE
)
for
(
int
i=tileSize_width
; i < rstep; i++ )
*
((
__global
ushort*
)
src_data+
(
y+lidy
)
*src_step/2+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
{
for
(
int
j=0
; j<VLEN_US; j++)
tmp_coi[j]
=
*
((
__global
ushort*
)
src_data+
(
y+lidy
)
*
(
int
)
src_step/2+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_US]
=
(
ushort8
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US]
=
*
(
src_data+
(
y+lidy
)
*src_step/
(
2*VLEN_US
)
+
(
x+i
)
/VLEN_US
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
if
(
y+lidy
<
src_rows
)
{
if
(
src_cols
>
TILE_SIZE
&&
tileSize_width
<
TILE_SIZE
)
for
(
int
i=tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
ushort*
)
src_data+
(
y+lidy
)
*src_step/2+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
{
for
(
int
j=0
; j<VLEN_US; j++)
tmp_coi[j]
=
*
((
__global
ushort*
)
src_data+
(
y+lidy
)
*
(
int
)
src_step/2+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_US]
=
(
ushort8
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US]
=
*
(
src_data+
(
y+lidy
)
*src_step/
(
2*VLEN_US
)
+
(
x+i
)
/VLEN_US
)
;
}
ushort8
zero
=
(
ushort8
)(
0
)
;
ushort8
full
=
(
ushort8
)(
255
)
;
if
(
binary
)
...
...
@@ -368,11 +382,11 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
tmp[i/VLEN_US]
=
(
tmp[i/VLEN_US]!=zero
)
?full:zero
;
F
mom[10]
;
__local
long
m[10][128]
;
if
(
lidy
==
0
)
if
(
lidy
<
128
)
for
(
int
i=0
; i<10; i++)
for
(
int
j=0
; j<128; j++)
m[i][j]=0
;
m[i][lidy]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
long
lm[10]
=
{0}
;
int8
x0
=
(
int8
)(
0
)
;
int8
x1
=
(
int8
)(
0
)
;
...
...
@@ -422,17 +436,22 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
lm[0]
=
x0.s0
; // m00
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
TILE_SIZE/2
; j >= 1; j = j/2 )
{
if
(
lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
lm[i]
=
lm[i]
+
m[i][lidy]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
TILE_SIZE/2
; j >= 1; j = j/2 )
{
if
(
lidy
>=
j/2&&lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
m[i][lidy-j/2]
=
lm[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
...
...
@@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
}
}
__kernel
void
CvMoments_D3
(
__global
short8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__kernel
void
CvMoments_D3
(
__global
short8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
short
tmp_coi[8]
; // get the coi data
short8
tmp[32]
;
...
...
@@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
int
x
=
wgidx*TILE_SIZE
; // real X index of pixel
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step/2,
TILE_SIZE
)
;
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep; i++ )
*
((
__global
short*
)
src_data+
(
y+lidy
)
*src_step/2+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_S)
{
for
(
int
j=0
; j<VLEN_S; j++)
tmp_coi[j]
=
*
((
__global
short*
)
src_data+
(
y+lidy
)
*src_step/2+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_S]
=
(
short8
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S]
=
*
(
src_data+
(
y+lidy
)
*src_step/
(
2*VLEN_S
)
+
(
x+i
)
/VLEN_S
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
if
(
y+lidy
<
src_rows
)
{
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
short*
)
src_data+
(
y+lidy
)
*src_step/2+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_S)
{
for
(
int
j=0
; j<VLEN_S; j++)
tmp_coi[j]
=
*
((
__global
short*
)
src_data+
(
y+lidy
)
*src_step/2+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_S]
=
(
short8
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S]
=
*
(
src_data+
(
y+lidy
)
*src_step/
(
2*VLEN_S
)
+
(
x+i
)
/VLEN_S
)
;
}
short8
zero
=
(
short8
)(
0
)
;
short8
full
=
(
short8
)(
255
)
;
if
(
binary
)
...
...
@@ -523,10 +547,9 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
F
mom[10]
;
__local
long
m[10][128]
;
if
(
lidy
==
0
)
if
(
lidy
<
128
)
for
(
int
i=0
; i<10; i++)
for
(
int
j=0
; j<128; j++)
m[i][j]=0
;
m[i][lidy]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
long
lm[10]
=
{0}
;
int8
x0
=
(
int8
)(
0
)
;
...
...
@@ -637,10 +660,10 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
}
}
__kernel
void
CvMoments_D5
(
__global
float*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__kernel
void
CvMoments_D5
(
__global
float*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
float
tmp_coi[4]
; // get the coi data
float4
tmp[64]
;
...
...
@@ -654,33 +677,30 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
int
y
=
wgidy*TILE_SIZE
; // real Y index of pixel
int
x
=
wgidx*TILE_SIZE
; // real X index of pixel
int
kcn
=
(
cn==2
)
?2:4
;
src_step
/=
sizeof
(
*src_data
)
;
int
rstep
=
min
(
src_step,
TILE_SIZE
)
;
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
int
rstep
=
min
(
src_step/4,
TILE_SIZE
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
int
maxIdx
=
mul24
(
src_rows,
src_cols
)
;
int
yOff
=
(
y+lidy
)
*src_step
;
int
index
;
if
(
tileSize_width
<
TILE_SIZE
&&
yOff
<
src_rows
)
for
(
int
i
=
tileSize_width
; i < rstep && (yOff+x+i) < maxIdx; i++
)
*
(
src_data+yOff+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_F
)
{
#
pragma
unroll
for
(
int
j=0
; j<4; j++
)
if
(
y+lidy
<
src_rows
)
{
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep && (x+i) < src_cols; i++
)
*
((
__global
float*
)
src_data+
(
y+lidy
)
*src_step/4+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_F
)
{
index
=
yOff+
(
x+i+j
)
*kcn+coi-1
;
if
(
index
<
maxIdx
)
tmp_coi[j]
=
*
(
src_data+index
)
;
else
tmp_coi[j]
=
0
;
for
(
int
j=0
; j<4; j++)
tmp_coi[j]
=
*
(
src_data+
(
y+lidy
)
*src_step/4+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_F]
=
(
float4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
}
tmp[i/VLEN_F]
=
(
float4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
}
else
for
(
int
i=0
; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
tmp[i/VLEN_F]
=
(
*
(
__global
float4
*
)(
src_data+yOff+x+i
))
;
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_F)
tmp[i/VLEN_F]
=
(
float4
)(
*
(
src_data+
(
y+lidy
)
*src_step/4+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/4+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/4+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/4+x+i+3
))
;
}
float4
zero
=
(
float4
)(
0
)
;
float4
full
=
(
float4
)(
255
)
;
if
(
binary
)
...
...
@@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
tmp[i/VLEN_F]
=
(
tmp[i/VLEN_F]!=zero
)
?full:zero
;
F
mom[10]
;
__local
F
m[10][128]
;
if
(
lidy
==
0
)
if
(
lidy
<
128
)
for
(
int
i
=
0
; i < 10; i ++)
for
(
int
j
=
0
; j < 128; j ++)
m[i][j]
=
0
;
m[i][lidy]
=
0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
F
lm[10]
=
{0}
;
F4
x0
=
(
F4
)(
0
)
;
...
...
@@ -770,66 +789,42 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
int
dst_x_off
=
mad24
(
wgidy,
dst_cols,
wgidx
)
;
int
dst_off
=
0
;
int
max_dst_index
=
10
*
blocky
*
get_global_size
(
1
)
;
//
+
m00
(
=
m00
'
)
dst_off
=
mad24
(
DST_ROW_00
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_off
=
mad24
(
DST_ROW_10
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_off
=
mad24
(
DST_ROW_01
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_off
=
mad24
(
DST_ROW_20
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_off
=
mad24
(
DST_ROW_11
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_off
=
mad24
(
DST_ROW_02
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_off
=
mad24
(
DST_ROW_30
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_off
=
mad24
(
DST_ROW_21
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_off
=
mad24
(
DST_ROW_12
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_off
=
mad24
(
DST_ROW_03
*
blocky,
dst_step,
dst_x_off
)
;
if
(
dst_off
<
max_dst_index
)
*
(
dst_m
+
dst_off
)
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
__kernel
void
CvMoments_D6
(
__global
F*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__kernel
void
CvMoments_D6
(
__global
F*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
F
tmp_coi[4]
; // get the coi data
F4
tmp[64]
;
...
...
@@ -844,22 +839,26 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
int
x
=
wgidx*TILE_SIZE
; // real X index of pixel
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step/8,
TILE_SIZE
)
;
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-
x
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-
x
)
;
if
(
y+lidy
<
src_rows
)
{
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
F*
)
src_data+
(
y+lidy
)
*src_step/8+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
{
for
(
int
j=0
; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
tmp_coi[j]
=
*
(
src_data+
(
y+lidy
)
*src_step/8+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_D]
=
(
F4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
}
else
for
(
int
i=0
; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
tmp[i/VLEN_D]
=
(
F4
)(
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+3
))
;
}
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep; i++ )
*
((
__global
F*
)
src_data+
(
y+lidy
)
*src_step/8+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
{
for
(
int
j=0
; j<4; j++)
tmp_coi[j]
=
*
(
src_data+
(
y+lidy
)
*src_step/8+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_D]
=
(
F4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D]
=
(
F4
)(
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+3
))
;
F4
zero
=
(
F4
)(
0
)
;
F4
full
=
(
F4
)(
255
)
;
if
(
binary
)
...
...
@@ -867,10 +866,9 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
tmp[i/VLEN_D]
=
(
tmp[i/VLEN_D]!=zero
)
?full:zero
;
F
mom[10]
;
__local
F
m[10][128]
;
if
(
lidy
==
0
)
if
(
lidy
<
128
)
for
(
int
i=0
; i<10; i++)
for
(
int
j=0
; j<128; j++)
m[i][j]=0
;
m[i][lidy]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
F
lm[10]
=
{0}
;
F4
x0
=
(
F4
)(
0
)
;
...
...
@@ -907,7 +905,6 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
m[1][lidy-bheight]
=
x1.s0
; // m10
m[0][lidy-bheight]
=
x0.s0
; // m00
}
else
if
(
lidy
<
bheight
)
{
lm[9]
=
((
F
)
py
)
*
sy
; // m03
...
...
@@ -922,6 +919,7 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
lm[0]
=
x0.s0
; // m00
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
TILE_SIZE/2
; j >= 1; j = j/2 )
{
if
(
lidy
<
j
)
...
...
modules/ocl/test/test_moments.cpp
View file @
1227e00f
...
...
@@ -45,12 +45,12 @@ TEST_P(MomentsTest, Mat)
{
if
(
test_contours
)
{
Mat
src
=
imread
(
workdir
+
"../cpp/pic3.png"
,
1
);
Mat
src_gray
,
canny_output
;
cvtColor
(
src
,
src_gray
,
CV_BGR2GRAY
)
;
Mat
src
=
imread
(
workdir
+
"../cpp/pic3.png"
,
IMREAD_GRAYSCALE
);
ASSERT_FALSE
(
src
.
empty
())
;
Mat
canny_output
;
vector
<
vector
<
Point
>
>
contours
;
vector
<
Vec4i
>
hierarchy
;
Canny
(
src
_gray
,
canny_output
,
100
,
200
,
3
);
Canny
(
src
,
canny_output
,
100
,
200
,
3
);
findContours
(
canny_output
,
contours
,
hierarchy
,
CV_RETR_TREE
,
CV_CHAIN_APPROX_SIMPLE
,
Point
(
0
,
0
)
);
for
(
size_t
i
=
0
;
i
<
contours
.
size
();
i
++
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment