Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
e80f5bed
Commit
e80f5bed
authored
Oct 31, 2013
by
Andrey Pavlenko
Committed by
OpenCV Buildbot
Oct 31, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #1710 from melody-rain:2.4_moments_ocl
parents
ef9f6905
3dbcd054
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
443 additions
and
926 deletions
+443
-926
ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+6
-1
perf_moments.cpp
modules/ocl/perf/perf_moments.cpp
+20
-19
moments.cpp
modules/ocl/src/moments.cpp
+159
-110
moments.cl
modules/ocl/src/opencl/moments.cl
+246
-780
test_moments.cpp
modules/ocl/test/test_moments.cpp
+12
-16
No files found.
modules/ocl/include/opencv2/ocl/ocl.hpp
View file @
e80f5bed
...
...
@@ -1520,7 +1520,12 @@ namespace cv
float
pos
,
oclMat
&
newFrame
,
oclMat
&
buf
);
//! computes moments of the rasterized shape or a vector of points
CV_EXPORTS
Moments
ocl_moments
(
InputArray
_array
,
bool
binaryImage
);
//! _array should be a vector a points standing for the contour
CV_EXPORTS
Moments
ocl_moments
(
InputArray
contour
);
//! src should be a general image uploaded to the GPU.
//! the supported oclMat type are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1
//! to use type of CV_64FC1, the GPU should support CV_64FC1
CV_EXPORTS
Moments
ocl_moments
(
oclMat
&
src
,
bool
binary
);
class
CV_EXPORTS
StereoBM_OCL
{
...
...
modules/ocl/perf/perf_moments.cpp
View file @
e80f5bed
...
...
@@ -26,7 +26,7 @@
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other
m
aterials provided with the distribution.
// and/or other
M
aterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
...
...
@@ -49,41 +49,42 @@
using
namespace
perf
;
using
std
::
tr1
::
tuple
;
using
std
::
tr1
::
get
;
using
namespace
cv
;
using
namespace
cv
::
ocl
;
using
namespace
cvtest
;
using
namespace
testing
;
using
namespace
std
;
///////////// Moments ////////////////////////
typedef
Size_MatType
MomentsFixture
;
///////////// Moments ////////////////////////
//*! performance of image
typedef
tuple
<
Size
,
MatType
,
bool
>
MomentsParamType
;
typedef
TestBaseWithParam
<
MomentsParamType
>
MomentsFixture
;
PERF_TEST_P
(
MomentsFixture
,
DISABLED_
Moments
,
PERF_TEST_P
(
MomentsFixture
,
Moments
,
::
testing
::
Combine
(
OCL_TYPICAL_MAT_SIZES
,
OCL_PERF_ENUM
(
CV_8UC1
,
CV_16SC1
,
CV_32FC1
,
CV_64FC1
)))
// TODO does not work properly (see below
)
OCL_PERF_ENUM
(
CV_8UC1
,
CV_16SC1
,
CV_16UC1
,
CV_32FC1
),
::
testing
::
Values
(
false
,
true
))
)
{
const
Size_MatType_t
params
=
GetParam
();
const
MomentsParamType
params
=
GetParam
();
const
Size
srcSize
=
get
<
0
>
(
params
);
const
int
type
=
get
<
1
>
(
params
);
const
bool
binaryImage
=
get
<
2
>
(
params
);
Mat
src
(
srcSize
,
type
),
dst
(
7
,
1
,
CV_64F
);
const
bool
binaryImage
=
false
;
cv
::
Moments
mom
;
declare
.
in
(
src
,
WARMUP_RNG
).
out
(
dst
);
randu
(
src
,
0
,
255
);
oclMat
src_d
(
src
);
cv
::
Moments
mom
;
if
(
RUN_OCL_IMPL
)
{
ocl
::
oclMat
oclSrc
(
src
);
OCL_TEST_CYCLE
()
mom
=
cv
::
ocl
::
ocl_moments
(
oclSrc
,
binaryImage
);
// TODO Use oclSrc
cv
::
HuMoments
(
mom
,
dst
);
SANITY_CHECK
(
dst
);
OCL_TEST_CYCLE
()
mom
=
cv
::
ocl
::
ocl_moments
(
src_d
,
binaryImage
);
}
else
if
(
RUN_PLAIN_IMPL
)
{
TEST_CYCLE
()
mom
=
cv
::
moments
(
src
,
binaryImage
);
cv
::
HuMoments
(
mom
,
dst
);
SANITY_CHECK
(
dst
);
}
else
OCL_PERF_ELSE
cv
::
HuMoments
(
mom
,
dst
);
SANITY_CHECK
(
dst
,
1e-3
);
}
modules/ocl/src/moments.cpp
View file @
e80f5bed
...
...
@@ -10,12 +10,12 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jin Ma, jin@multicorewareinc.com
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
...
...
@@ -26,7 +26,7 @@
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other
m
aterials provided with the distribution.
// and/or other
M
aterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
...
...
@@ -46,13 +46,16 @@
#include "precomp.hpp"
#include "opencl_kernels.hpp"
#if defined _MSC_VER
#define snprintf sprintf_s
#endif
namespace
cv
{
namespace
ocl
{
// The function calculates center of gravity and the central second order moments
static
void
icvCompleteMomentState
(
CvMoments
*
moments
)
{
namespace
ocl
{
// The function calculates center of gravity and the central second order moments
static
void
icvCompleteMomentState
(
CvMoments
*
moments
)
{
double
cx
=
0
,
cy
=
0
;
double
mu20
,
mu11
,
mu02
;
...
...
@@ -87,11 +90,11 @@ static void icvCompleteMomentState( CvMoments* moments )
moments
->
mu12
=
moments
->
m12
-
cy
*
(
mu11
+
cy
*
moments
->
m10
)
-
cx
*
mu02
;
// mu03 = m03 - cy*(3*mu02 + cy*m01)
moments
->
mu03
=
moments
->
m03
-
cy
*
(
3
*
mu02
+
cy
*
moments
->
m01
);
}
}
static
void
icvContourMoments
(
CvSeq
*
contour
,
CvMoments
*
mom
)
{
static
void
icvContourMoments
(
CvSeq
*
contour
,
CvMoments
*
mom
)
{
if
(
contour
->
total
)
{
CvSeqReader
reader
;
...
...
@@ -141,7 +144,10 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
cl_int
dst_step
=
(
cl_int
)
dst_a
.
step
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_step
));
openCLExecuteKernel
(
dst_a
.
clCxt
,
&
moments
,
"icvContourMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
char
builOption
[
128
];
snprintf
(
builOption
,
128
,
"-D CV_8UC1"
);
openCLExecuteKernel
(
dst_a
.
clCxt
,
&
moments
,
"icvContourMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
builOption
);
cv
::
Mat
dst
(
dst_a
);
a00
=
a10
=
a01
=
a20
=
a11
=
a02
=
a30
=
a21
=
a12
=
a03
=
0.0
;
...
...
@@ -212,31 +218,155 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
icvCompleteMomentState
(
mom
);
}
}
}
}
static
void
ocl_cvMoments
(
const
void
*
array
,
CvMoments
*
mom
,
int
binary
)
{
Moments
ocl_moments
(
oclMat
&
src
,
bool
binary
)
//for image
{
CV_Assert
(
src
.
oclchannels
()
==
1
);
if
(
src
.
type
()
==
CV_64FC1
&&
!
Context
::
getContext
()
->
supportsFeature
(
FEATURE_CL_DOUBLE
))
{
CV_Error
(
CV_StsUnsupportedFormat
,
"Moments - double is not supported by your GPU!"
);
}
if
(
binary
)
{
oclMat
mask
;
if
(
src
.
type
()
!=
CV_8UC1
)
{
src
.
convertTo
(
mask
,
CV_8UC1
);
}
oclMat
src8u
(
src
.
size
(),
CV_8UC1
);
src8u
.
setTo
(
Scalar
(
255
),
mask
);
src
=
src8u
;
}
const
int
TILE_SIZE
=
256
;
int
type
,
depth
,
cn
,
coi
=
0
;
CvMat
stub
,
*
mat
=
(
CvMat
*
)
array
;
CvContour
contourHeader
;
CvMoments
mom
;
memset
(
&
mom
,
0
,
sizeof
(
mom
));
cv
::
Size
size
=
src
.
size
();
int
blockx
,
blocky
;
blockx
=
(
size
.
width
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
blocky
=
(
size
.
height
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
oclMat
dst_m
;
int
tile_height
=
TILE_SIZE
;
size_t
localThreads
[
3
]
=
{
1
,
tile_height
,
1
};
size_t
globalThreads
[
3
]
=
{
blockx
,
size
.
height
,
1
};
if
(
Context
::
getContext
()
->
supportsFeature
(
FEATURE_CL_DOUBLE
))
{
dst_m
.
create
(
blocky
*
10
,
blockx
,
CV_64FC1
);
}
else
{
dst_m
.
create
(
blocky
*
10
,
blockx
,
CV_32FC1
);
}
int
src_step
=
(
int
)(
src
.
step
/
src
.
elemSize
());
int
dstm_step
=
(
int
)(
dst_m
.
step
/
dst_m
.
elemSize
());
vector
<
pair
<
size_t
,
const
void
*>
>
args
,
args_sum
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src_step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dstm_step
));
int
binary_
;
if
(
binary
)
binary_
=
1
;
else
binary_
=
0
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
binary_
));
char
builOption
[
128
];
if
(
binary
||
src
.
type
()
==
CV_8UC1
)
{
snprintf
(
builOption
,
128
,
"-D CV_8UC1"
);
}
else
if
(
src
.
type
()
==
CV_16UC1
)
{
snprintf
(
builOption
,
128
,
"-D CV_16UC1"
);
}
else
if
(
src
.
type
()
==
CV_16SC1
)
{
snprintf
(
builOption
,
128
,
"-D CV_16SC1"
);
}
else
if
(
src
.
type
()
==
CV_32FC1
)
{
snprintf
(
builOption
,
128
,
"-D CV_32FC1"
);
}
else
if
(
src
.
type
()
==
CV_64FC1
)
{
snprintf
(
builOption
,
128
,
"-D CV_64FC1"
);
}
else
{
CV_Error
(
CV_StsUnsupportedFormat
,
""
);
}
openCLExecuteKernel
(
Context
::
getContext
(),
&
moments
,
"CvMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
builOption
);
Mat
tmp
(
dst_m
);
tmp
.
convertTo
(
tmp
,
CV_64FC1
);
double
tmp_m
[
10
]
=
{
0
};
for
(
int
j
=
0
;
j
<
tmp
.
rows
;
j
+=
10
)
{
for
(
int
i
=
0
;
i
<
tmp
.
cols
;
i
++
)
{
tmp_m
[
0
]
+=
tmp
.
at
<
double
>
(
j
,
i
);
tmp_m
[
1
]
+=
tmp
.
at
<
double
>
(
j
+
1
,
i
);
tmp_m
[
2
]
+=
tmp
.
at
<
double
>
(
j
+
2
,
i
);
tmp_m
[
3
]
+=
tmp
.
at
<
double
>
(
j
+
3
,
i
);
tmp_m
[
4
]
+=
tmp
.
at
<
double
>
(
j
+
4
,
i
);
tmp_m
[
5
]
+=
tmp
.
at
<
double
>
(
j
+
5
,
i
);
tmp_m
[
6
]
+=
tmp
.
at
<
double
>
(
j
+
6
,
i
);
tmp_m
[
7
]
+=
tmp
.
at
<
double
>
(
j
+
7
,
i
);
tmp_m
[
8
]
+=
tmp
.
at
<
double
>
(
j
+
8
,
i
);
tmp_m
[
9
]
+=
tmp
.
at
<
double
>
(
j
+
9
,
i
);
}
}
mom
.
m00
=
tmp_m
[
0
];
mom
.
m10
=
tmp_m
[
1
];
mom
.
m01
=
tmp_m
[
2
];
mom
.
m20
=
tmp_m
[
3
];
mom
.
m11
=
tmp_m
[
4
];
mom
.
m02
=
tmp_m
[
5
];
mom
.
m30
=
tmp_m
[
6
];
mom
.
m21
=
tmp_m
[
7
];
mom
.
m12
=
tmp_m
[
8
];
mom
.
m03
=
tmp_m
[
9
];
icvCompleteMomentState
(
&
mom
);
return
mom
;
}
Moments
ocl_moments
(
InputArray
_contour
)
//for contour
{
CvMoments
mom
;
memset
(
&
mom
,
0
,
sizeof
(
mom
));
Mat
arr
=
_contour
.
getMat
();
CvMat
c_array
=
arr
;
const
void
*
array
=
&
c_array
;
CvSeq
*
contour
=
0
;
CvSeqBlock
block
;
if
(
CV_IS_SEQ
(
array
))
{
contour
=
(
CvSeq
*
)
array
;
contour
=
(
CvSeq
*
)(
array
)
;
if
(
!
CV_IS_SEQ_POINT_SET
(
contour
))
CV_Error
(
CV_StsBadArg
,
"The passed sequence is not a valid contour"
);
}
if
(
!
mom
)
CV_Error
(
CV_StsNullPtr
,
""
);
int
type
,
coi
=
0
;
memset
(
mom
,
0
,
sizeof
(
*
mom
));
CvMat
stub
,
*
mat
=
(
CvMat
*
)(
array
);
CvContour
contourHeader
;
CvSeqBlock
block
;
if
(
!
contour
)
{
mat
=
cvGetMat
(
mat
,
&
stub
,
&
coi
);
type
=
CV_MAT_TYPE
(
mat
->
type
);
...
...
@@ -247,93 +377,11 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
mat
,
&
contourHeader
,
&
block
);
}
}
if
(
contour
)
{
icvContourMoments
(
contour
,
mom
);
return
;
}
type
=
CV_MAT_TYPE
(
mat
->
type
);
depth
=
CV_MAT_DEPTH
(
type
);
cn
=
CV_MAT_CN
(
type
);
cv
::
Size
size
=
cvGetMatSize
(
mat
);
if
(
cn
>
1
&&
coi
==
0
)
CV_Error
(
CV_StsBadArg
,
"Invalid image type"
);
if
(
size
.
width
<=
0
||
size
.
height
<=
0
)
return
;
cv
::
Mat
src0
(
mat
);
cv
::
ocl
::
oclMat
src
(
src0
);
cv
::
Size
tileSize
;
int
blockx
,
blocky
;
if
(
size
.
width
%
TILE_SIZE
==
0
)
blockx
=
size
.
width
/
TILE_SIZE
;
else
blockx
=
size
.
width
/
TILE_SIZE
+
1
;
if
(
size
.
height
%
TILE_SIZE
==
0
)
blocky
=
size
.
height
/
TILE_SIZE
;
else
blocky
=
size
.
height
/
TILE_SIZE
+
1
;
oclMat
dst_m
(
blocky
*
10
,
blockx
,
CV_64FC1
);
oclMat
sum
(
1
,
10
,
CV_64FC1
);
int
tile_width
=
std
::
min
(
size
.
width
,
TILE_SIZE
);
int
tile_height
=
std
::
min
(
size
.
height
,
TILE_SIZE
);
size_t
localThreads
[
3
]
=
{
tile_height
,
1
,
1
};
size_t
globalThreads
[
3
]
=
{
size
.
height
,
blockx
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
,
args_sum
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
src
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
blocky
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
depth
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
cn
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
coi
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
binary
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
TILE_SIZE
));
openCLExecuteKernel
(
Context
::
getContext
(),
&
moments
,
"CvMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
);
size_t
localThreadss
[
3
]
=
{
128
,
1
,
1
};
size_t
globalThreadss
[
3
]
=
{
128
,
1
,
1
};
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
rows
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
cols
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tile_height
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tile_width
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
TILE_SIZE
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
step
));
openCLExecuteKernel
(
Context
::
getContext
(),
&
moments
,
"dst_sum"
,
globalThreadss
,
localThreadss
,
args_sum
,
-
1
,
-
1
);
Mat
dstsum
(
sum
);
mom
->
m00
=
dstsum
.
at
<
double
>
(
0
,
0
);
mom
->
m10
=
dstsum
.
at
<
double
>
(
0
,
1
);
mom
->
m01
=
dstsum
.
at
<
double
>
(
0
,
2
);
mom
->
m20
=
dstsum
.
at
<
double
>
(
0
,
3
);
mom
->
m11
=
dstsum
.
at
<
double
>
(
0
,
4
);
mom
->
m02
=
dstsum
.
at
<
double
>
(
0
,
5
);
mom
->
m30
=
dstsum
.
at
<
double
>
(
0
,
6
);
mom
->
m21
=
dstsum
.
at
<
double
>
(
0
,
7
);
mom
->
m12
=
dstsum
.
at
<
double
>
(
0
,
8
);
mom
->
m03
=
dstsum
.
at
<
double
>
(
0
,
9
);
icvCompleteMomentState
(
mom
);
}
Moments
ocl_moments
(
InputArray
_array
,
bool
binaryImage
)
{
CvMoments
om
;
Mat
arr
=
_array
.
getMat
();
CvMat
c_array
=
arr
;
ocl_cvMoments
(
&
c_array
,
&
om
,
binaryImage
);
return
om
;
}
}
CV_Assert
(
contour
);
icvContourMoments
(
contour
,
&
mom
);
return
mom
;
}
}
}
\ No newline at end of file
modules/ocl/src/opencl/moments.cl
View file @
e80f5bed
...
...
@@ -15,6 +15,7 @@
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
@Authors
//
Jin
Ma,
jin@multicorewareinc.com
//
Sen
Liu,
swjtuls1987@126.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
...
...
@@ -44,22 +45,14 @@
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
typedef
double
T
;
typedef
double
F
;
typedef
double4
F4
;
#
define
convert_F4
convert_double4
#
else
typedef
float
F
;
typedef
float4
F4
;
typedef
long
T
;
#
define
convert_F4
convert_float4
#
endif
#
define
DST_ROW_00
0
...
...
@@ -99,7 +92,6 @@ __kernel void icvContourMoments(int contour_total,
xi
=
(
T
)(
*
(
reader_oclmat_data
+
(
idx
+
1
)
*
2
))
;
yi
=
(
T
)(
*
(
reader_oclmat_data
+
(
idx
+
1
)
*
2
+
1
))
;
}
xi2
=
xi
*
xi
;
yi2
=
yi
*
yi
;
dxy
=
xi_1
*
yi
-
xi
*
yi_1
;
...
...
@@ -123,858 +115,332 @@ __kernel void icvContourMoments(int contour_total,
yi2
*
(
xi_1
+
3
*
xi
))
;
}
__kernel
void
dst_sum
(
int
src_rows,
int
src_cols,
int
tile_height,
int
tile_width,
int
TILE_SIZE,
__global
F*
sum,
__global
F*
dst_m,
int
dst_step
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
define
WT
double
#
define
WT4
double4
#
define
convert_T4
convert_double4
#
define
convert_T
convert_double
#
else
#
define
WT
float
#
define
WT4
float4
#
define
convert_T4
convert_float4
#
define
convert_T
convert_float
#
endif
#
ifdef
CV_8UC1
#
define
TT
uchar
#
elif
defined
CV_16UC1
#
define
TT
ushort
#
elif
defined
CV_16SC1
#
define
TT
short
#
elif
defined
CV_32FC1
#
define
TT
float
#
elif
defined
CV_64FC1
#
ifdef
DOUBLE_SUPPORT
#
define
TT
double
#
else
#
define
TT
float
#
endif
#
endif
__kernel
void
CvMoments
(
__global
TT*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
WT*
dst_m,
int
dst_cols,
int
dst_step,
int
binary
)
{
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
block_y
=
src_rows/tile_height
;
int
block_x
=
src_cols/tile_width
;
int
block_num
;
if
(
src_rows
>
TILE_SIZE
&&
src_rows
%
TILE_SIZE
!=
0
)
block_y
++
;
if
(
src_cols
>
TILE_SIZE
&&
src_cols
%
TILE_SIZE
!=
0
)
block_x
++
;
block_num
=
block_y
*
block_x
;
__local
F
dst_sum[10][128]
;
if
(
gidy<128-block_num
)
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy+block_num]=0
;
int
dy
=
get_global_id
(
1
)
;
int
ly
=
get_local_id
(
1
)
;
int
gidx
=
get_group_id
(
0
)
;
int
gidy
=
get_group_id
(
1
)
;
int
x_rest
=
src_cols
%
256
;
int
y_rest
=
src_rows
%
256
;
__local
int
codxy[256]
;
codxy[ly]
=
ly
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
dst_step
/=
sizeof
(
F
)
;
if
(
gidy<block_num
)
{
dst_sum[0][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_00
*
block_y,
dst_step,
gidy
))
;
dst_sum[1][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_10
*
block_y,
dst_step,
gidy
))
;
dst_sum[2][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_01
*
block_y,
dst_step,
gidy
))
;
dst_sum[3][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_20
*
block_y,
dst_step,
gidy
))
;
dst_sum[4][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_11
*
block_y,
dst_step,
gidy
))
;
dst_sum[5][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_02
*
block_y,
dst_step,
gidy
))
;
dst_sum[6][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_30
*
block_y,
dst_step,
gidy
))
;
dst_sum[7][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_21
*
block_y,
dst_step,
gidy
))
;
dst_sum[8][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_12
*
block_y,
dst_step,
gidy
))
;
dst_sum[9][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_03
*
block_y,
dst_step,
gidy
))
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
lsize=64
; lsize>0; lsize>>=1)
{
if
(
gidy<lsize
)
{
int
lsize2
=
gidy
+
lsize
;
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy]
+=
dst_sum[i][lsize2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
gidy==0
)
for
(
int
i=0
; i<10; i++)
sum[i]
=
dst_sum[i][0]
;
}
WT4
x0
=
(
WT4
)(
0.f
)
;
WT4
x1
=
(
WT4
)(
0.f
)
;
WT4
x2
=
(
WT4
)(
0.f
)
;
WT4
x3
=
(
WT4
)(
0.f
)
;
__kernel
void
CvMoments_D0
(
__global
uchar16*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
depth,
int
cn,
int
coi,
int
binary,
int
TILE_SIZE
)
{
uchar
tmp_coi[16]
; // get the coi data
uchar16
tmp[16]
;
int
VLEN_C
=
16
; // vector length of uchar
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
wgidy
=
get_group_id
(
0
)
;
int
wgidx
=
get_group_id
(
1
)
;
int
lidy
=
get_local_id
(
0
)
;
int
lidx
=
get_local_id
(
1
)
;
int
y
=
wgidy*TILE_SIZE
; // vector length of uchar
int
x
=
wgidx*TILE_SIZE
; // vector length of uchar
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step,
TILE_SIZE
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-
x
)
;
if
(
y+lidy
<
src_rows
)
{
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
uchar*
)
src_data+
(
y+lidy
)
*src_step+x+i
)
=
0
;
__global
TT*
row
=
src_data
+
gidy
*
src_step
+
ly
*
src_step
+
gidx
*
256
;
bool
switchFlag
=
false
;
if
(
coi
>
0
)
//channel
of
interest
for
(
int
i
=
0
; i < tileSize_width; i += VLEN_C)
{
for
(
int
j=0
; j<VLEN_C; j++)
tmp_coi[j]
=
*
((
__global
uchar*
)
src_data+
(
y+lidy
)
*src_step+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_C]
=
(
uchar16
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C]
=
*
(
src_data+
(
y+lidy
)
*src_step/VLEN_C+
(
x+i
)
/VLEN_C
)
;
}
WT4
p
;
WT4
x
;
WT4
xp
;
WT4
xxp
;
uchar16
zero
=
(
uchar16
)(
0
)
;
uchar16
full
=
(
uchar16
)(
255
)
;
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C]
=
(
tmp[i/VLEN_C]!=zero
)
?full:zero
;
WT
py
=
0.f,
sy
=
0.f
;
F
mom[10]
;
__local
int
m[10][128]
;
if
(
lidy
<
128
)
if
(
dy
<
src_rows
)
{
for
(
int
i=0
; i<10; i++)
m[i][lidy]=0
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
lm[10]
=
{0}
;
int16
x0
=
(
int16
)(
0
)
;
int16
x1
=
(
int16
)(
0
)
;
int16
x2
=
(
int16
)(
0
)
;
int16
x3
=
(
int16
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+=(VLEN_C) )
if
((
x_rest
>
0
)
&&
(
gidx
==
(
get_num_groups
(
0
)
-
1
)))
{
int16
v_xt
=
(
int16
)(
xt,
xt+1,
xt+2,
xt+3,
xt+4,
xt+5,
xt+6,
xt+7,
xt+8,
xt+9,
xt+10,
xt+11,
xt+12,
xt+13,
xt+14,
xt+15
)
;
int16
p
=
convert_int16
(
tmp[xt/VLEN_C]
)
;
int16
xp
=
v_xt
*
p,
xxp
=
xp
*v_xt
;
int
i
;
for
(
i
=
0
; i < x_rest - 4; i += 4)
{
p
=
convert_T4
(
vload4
(
0
,
row
+
i
))
;
x
=
convert_T4
(
vload4
(
0
,
codxy
+
i
))
;
xp
=
x
*
p
;
xxp
=
xp
*
x
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
x3
+=
xxp
*
v_xt
;
}
x0.s0
+=
x0.s1
+
x0.s2
+
x0.s3
+
x0.s4
+
x0.s5
+
x0.s6
+
x0.s7
+
x0.s8
+
x0.s9
+
x0.sa
+
x0.sb
+
x0.sc
+
x0.sd
+
x0.se
+
x0.sf
;
x1.s0
+=
x1.s1
+
x1.s2
+
x1.s3
+
x1.s4
+
x1.s5
+
x1.s6
+
x1.s7
+
x1.s8
+
x1.s9
+
x1.sa
+
x1.sb
+
x1.sc
+
x1.sd
+
x1.se
+
x1.sf
;
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
+
x2.s4
+
x2.s5
+
x2.s6
+
x2.s7
+
x2.s8
+
x2.s9
+
x2.sa
+
x2.sb
+
x2.sc
+
x2.sd
+
x2.se
+
x2.sf
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
+
x3.s4
+
x3.s5
+
x3.s6
+
x3.s7
+
x3.s8
+
x3.s9
+
x3.sa
+
x3.sb
+
x3.sc
+
x3.sd
+
x3.se
+
x3.sf
;
int
py
=
lidy
*
((
int
)
x0.s0
)
;
int
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
{
m[9][lidy-bheight]
=
((
int
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
int
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
int
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
m[3][lidy-bheight]
=
x2.s0
; // m20
m[2][lidy-bheight]
=
py
; // m01
m[1][lidy-bheight]
=
x1.s0
; // m10
m[0][lidy-bheight]
=
x0.s0
; // m00
}
else
if
(
lidy
<
bheight
)
{
lm[9]
=
((
int
)
py
)
*
sy
; // m03
lm[8]
=
((
int
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
int
)
x2.s0
)
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
lm[3]
=
x2.s0
; // m20
lm[2]
=
py
; // m01
lm[1]
=
x1.s0
; // m10
lm[0]
=
x0.s0
; // m00
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
bheight
; j >= 1; j = j/2 )
{
if
(
lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
lm[i]
=
lm[i]
+
m[i][lidy]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lidy
>=
j/2&&lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
m[i][lidy-j/2]
=
lm[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
x3
+=
convert_T4
(
xxp
*
x
)
;
}
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
x0.s0
=
x0.s0
+
x0.s1
+
x0.s2
+
x0.s3
;
//
+
m00
(
=
m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
x1.s0
=
x1.s0
+
x1.s1
+
x1.s2
+
x1.s3
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
x2.s0
=
x2.s0
+
x2.s1
+
x2.s2
+
x2.s3
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
x3.s0
=
x3.s0
+
x3.s1
+
x3.s2
+
x3.s3
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
WT
x0_
=
0
;
WT
x1_
=
0
;
WT
x2_
=
0
;
WT
x3_
=
0
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
for
(
; i < x_rest; i++)
{
WT
p_
=
0
;
p_
=
row[i]
;
WT
x_
=
convert_T
(
codxy[i]
)
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
WT
xp_
=
x_
*
p_
;
WT
xxp_
=
xp_
*
x_
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
x0_
+=
p_
;
x1_
+=
xp_
;
x2_
+=
xxp_
;
x3_
+=
xxp_
*
x_
;
}
}
__kernel
void
CvMoments_D2
(
__global
ushort8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
ushort
tmp_coi[8]
; // get the coi data
ushort8
tmp[32]
;
int
VLEN_US
=
8
; // vector length of ushort
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
wgidy
=
get_group_id
(
0
)
;
int
wgidx
=
get_group_id
(
1
)
;
int
lidy
=
get_local_id
(
0
)
;
int
lidx
=
get_local_id
(
1
)
;
int
y
=
wgidy*TILE_SIZE
; // real Y index of pixel
int
x
=
wgidx*TILE_SIZE
; // real X index of pixel
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step/2,
TILE_SIZE
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
if
(
y+lidy
<
src_rows
)
x0.s0
+=
x0_
;
x1.s0
+=
x1_
;
x2.s0
+=
x2_
;
x3.s0
+=
x3_
;
}else
{
if
(
src_cols
>
TILE_SIZE
&&
tileSize_width
<
TILE_SIZE
)
for
(
int
i=tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
ushort*
)
src_data+
(
y+lidy
)
*src_step/2+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
for
(
int
i
=
0
; i < 256; i += 4)
{
for
(
int
j=0
; j<VLEN_US; j++)
tmp_coi[j]
=
*
((
__global
ushort*
)
src_data+
(
y+lidy
)
*
(
int
)
src_step/2+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_US]
=
(
ushort8
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US]
=
*
(
src_data+
(
y+lidy
)
*src_step/
(
2*VLEN_US
)
+
(
x+i
)
/VLEN_US
)
;
}
p
=
convert_T4
(
vload4
(
0
,
row
+
i
))
;
x
=
convert_T4
(
vload4
(
0
,
codxy
+
i
))
;
xp
=
x
*
p
;
xxp
=
xp
*
x
;
ushort8
zero
=
(
ushort8
)(
0
)
;
ushort8
full
=
(
ushort8
)(
255
)
;
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US]
=
(
tmp[i/VLEN_US]!=zero
)
?full:zero
;
F
mom[10]
;
__local
long
m[10][128]
;
if
(
lidy
<
128
)
for
(
int
i=0
; i<10; i++)
m[i][lidy]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
long
lm[10]
=
{0}
;
int8
x0
=
(
int8
)(
0
)
;
int8
x1
=
(
int8
)(
0
)
;
int8
x2
=
(
int8
)(
0
)
;
long8
x3
=
(
long8
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+=(VLEN_US) )
{
int8
v_xt
=
(
int8
)(
xt,
xt+1,
xt+2,
xt+3,
xt+4,
xt+5,
xt+6,
xt+7
)
;
int8
p
=
convert_int8
(
tmp[xt/VLEN_US]
)
;
int8
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
x3
+=
convert_long8
(
xxp
)
*convert_long8
(
v_xt
)
;
}
x0.s0
+=
x0.s1
+
x0.s2
+
x0.s3
+
x0.s4
+
x0.s5
+
x0.s6
+
x0.s7
;
x1.s0
+=
x1.s1
+
x1.s2
+
x1.s3
+
x1.s4
+
x1.s5
+
x1.s6
+
x1.s7
;
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
+
x2.s4
+
x2.s5
+
x2.s6
+
x2.s7
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
+
x3.s4
+
x3.s5
+
x3.s6
+
x3.s7
;
int
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
{
m[9][lidy-bheight]
=
((
long
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
long
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
long
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
m[3][lidy-bheight]
=
x2.s0
; // m20
m[2][lidy-bheight]
=
py
; // m01
m[1][lidy-bheight]
=
x1.s0
; // m10
m[0][lidy-bheight]
=
x0.s0
; // m00
}
else
if
(
lidy
<
bheight
)
{
lm[9]
=
((
long
)
py
)
*
sy
; // m03
lm[8]
=
((
long
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
long
)
x2.s0
)
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
lm[3]
=
x2.s0
; // m20
lm[2]
=
py
; // m01
lm[1]
=
x1.s0
; // m10
lm[0]
=
x0.s0
; // m00
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
TILE_SIZE/2
; j >= 1; j = j/2 )
{
if
(
lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
lm[i]
=
lm[i]
+
m[i][lidy]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
TILE_SIZE/2
; j >= 1; j = j/2 )
{
if
(
lidy
>=
j/2&&lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
m[i][lidy-j/2]
=
lm[i]
;
x3
+=
convert_T4
(
xxp
*
x
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
F
xm
=
x
*mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
x0.s0
=
x0.s0
+
x0.s1
+
x0.s2
+
x0.s3
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
x1.s0
=
x1.s0
+
x1.s1
+
x1.s2
+
x1.s3
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
x2.s0
=
x2.s0
+
x2.s1
+
x2.s2
+
x2.s3
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
x3.s0
=
x3.s0
+
x3.s1
+
x3.s2
+
x3.s3
;
}
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
py
=
ly
*
x0.s0
;
sy
=
ly
*
ly
;
}
}
__local
WT
mom[10][256]
;
__kernel
void
CvMoments_D3
(
__global
short8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
short
tmp_coi[8]
; // get the coi data
short8
tmp[32]
;
int
VLEN_S
=8
; // vector length of short
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
wgidy
=
get_group_id
(
0
)
;
int
wgidx
=
get_group_id
(
1
)
;
int
lidy
=
get_local_id
(
0
)
;
int
lidx
=
get_local_id
(
1
)
;
int
y
=
wgidy*TILE_SIZE
; // real Y index of pixel
int
x
=
wgidx*TILE_SIZE
; // real X index of pixel
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step/2,
TILE_SIZE
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
if
(
y+lidy
<
src_rows
)
if
((
y_rest
>
0
)
&&
(
gidy
==
(
get_num_groups
(
1
)
-
1
)))
{
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
short*
)
src_data+
(
y+lidy
)
*src_step/2+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_S)
if
(
ly
<
y_rest
)
{
for
(
int
j=0
; j<VLEN_S; j++)
tmp_coi[j]
=
*
((
__global
short*
)
src_data+
(
y+lidy
)
*src_step/2+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_S]
=
(
short8
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]
)
;
mom[9][ly]
=
py
*
sy
;
mom[8][ly]
=
x1.s0
*
sy
;
mom[7][ly]
=
x2.s0
*
ly
;
mom[6][ly]
=
x3.s0
;
mom[5][ly]
=
x0.s0
*
sy
;
mom[4][ly]
=
x1.s0
*
ly
;
mom[3][ly]
=
x2.s0
;
mom[2][ly]
=
py
;
mom[1][ly]
=
x1.s0
;
mom[0][ly]
=
x0.s0
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S]
=
*
(
src_data+
(
y+lidy
)
*src_step/
(
2*VLEN_S
)
+
(
x+i
)
/VLEN_S
)
;
}
short8
zero
=
(
short8
)(
0
)
;
short8
full
=
(
short8
)(
255
)
;
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=(VLEN_S))
tmp[i/VLEN_S]
=
(
tmp[i/VLEN_S]!=zero
)
?full:zero
;
F
mom[10]
;
__local
long
m[10][128]
;
if
(
lidy
<
128
)
for
(
int
i=0
; i<10; i++)
m[i][lidy]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
long
lm[10]
=
{0}
;
int8
x0
=
(
int8
)(
0
)
;
int8
x1
=
(
int8
)(
0
)
;
int8
x2
=
(
int8
)(
0
)
;
long8
x3
=
(
long8
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+= (VLEN_S))
if
(
ly
<
10
)
{
int8
v_xt
=
(
int8
)(
xt,
xt+1,
xt+2,
xt+3,
xt+4,
xt+5,
xt+6,
xt+7
)
;
int8
p
=
convert_int8
(
tmp[xt/VLEN_S]
)
;
int8
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
x3
+=
convert_long8
(
xxp
)
*
convert_long8
(
v_xt
)
;
}
x0.s0
+=
x0.s1
+
x0.s2
+
x0.s3
+
x0.s4
+
x0.s5
+
x0.s6
+
x0.s7
;
x1.s0
+=
x1.s1
+
x1.s2
+
x1.s3
+
x1.s4
+
x1.s5
+
x1.s6
+
x1.s7
;
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
+
x2.s4
+
x2.s5
+
x2.s6
+
x2.s7
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
+
x3.s4
+
x3.s5
+
x3.s6
+
x3.s7
;
int
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
for
(
int
i
=
1
; i < y_rest; i++)
{
m[9][lidy-bheight]
=
((
long
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
long
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
long
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
m[3][lidy-bheight]
=
x2.s0
; // m20
m[2][lidy-bheight]
=
py
; // m01
m[1][lidy-bheight]
=
x1.s0
; // m10
m[0][lidy-bheight]
=
x0.s0
; // m00
mom[ly][0]
=
mom[ly][i]
+
mom[ly][0]
;
}
else
if
(
lidy
<
bheight
)
{
lm[9]
=
((
long
)
py
)
*
sy
; // m03
lm[8]
=
((
long
)(
x1.s0
))
*
sy
; // m12
lm[7]
=
((
long
)(
x2.s0
))
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
lm[3]
=
x2.s0
; // m20
lm[2]
=
py
; // m01
lm[1]
=
x1.s0
; // m10
lm[0]
=
x0.s0
; // m00
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
TILE_SIZE/2
; j >=1; j = j/2 )
}else
{
if
(
lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
lm[i]
=
lm[i]
+
m[i][lidy]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lidy
>=
j/2&&lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
m[i][lidy-j/2]
=
lm[i]
;
mom[9][ly]
=
py
*
sy
;
mom[8][ly]
=
x1.s0
*
sy
;
mom[7][ly]
=
x2.s0
*
ly
;
mom[6][ly]
=
x3.s0
;
mom[5][ly]
=
x0.s0
*
sy
;
mom[4][ly]
=
x1.s0
*
ly
;
mom[3][ly]
=
x2.s0
;
mom[2][ly]
=
py
;
mom[1][ly]
=
x1.s0
;
mom[0][ly]
=
x0.s0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
lidy
==0
&&lidx
==0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
if
(
ly
<
128
)
{
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
F
xm
=
x
*
mom[0],
ym
=
y*mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
mom[0][ly]
=
mom[0][ly]
+
mom[0][ly
+
128]
;
mom[1][ly]
=
mom[1][ly]
+
mom[1][ly
+
128]
;
mom[2][ly]
=
mom[2][ly]
+
mom[2][ly
+
128]
;
mom[3][ly]
=
mom[3][ly]
+
mom[3][ly
+
128]
;
mom[4][ly]
=
mom[4][ly]
+
mom[4][ly
+
128]
;
mom[5][ly]
=
mom[5][ly]
+
mom[5][ly
+
128]
;
mom[6][ly]
=
mom[6][ly]
+
mom[6][ly
+
128]
;
mom[7][ly]
=
mom[7][ly]
+
mom[7][ly
+
128]
;
mom[8][ly]
=
mom[8][ly]
+
mom[8][ly
+
128]
;
mom[9][ly]
=
mom[9][ly]
+
mom[9][ly
+
128]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__kernel
void
CvMoments_D5
(
__global
float*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
float
tmp_coi[4]
; // get the coi data
float4
tmp[64]
;
int
VLEN_F
=
4
; // vector length of float
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
wgidy
=
get_group_id
(
0
)
;
int
wgidx
=
get_group_id
(
1
)
;
int
lidy
=
get_local_id
(
0
)
;
int
lidx
=
get_local_id
(
1
)
;
int
y
=
wgidy*TILE_SIZE
; // real Y index of pixel
int
x
=
wgidx*TILE_SIZE
; // real X index of pixel
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step/4,
TILE_SIZE
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-x
)
;
int
maxIdx
=
mul24
(
src_rows,
src_cols
)
;
int
yOff
=
(
y+lidy
)
*src_step
;
int
index
;
if
(
y+lidy
<
src_rows
)
if
(
ly
<
64
)
{
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
float*
)
src_data+
(
y+lidy
)
*src_step/4+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_F)
{
for
(
int
j=0
; j<4; j++)
tmp_coi[j]
=
*
(
src_data+
(
y+lidy
)
*src_step/4+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_F]
=
(
float4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_F)
tmp[i/VLEN_F]
=
(
float4
)(
*
(
src_data+
(
y+lidy
)
*src_step/4+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/4+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/4+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/4+x+i+3
))
;
mom[0][ly]
=
mom[0][ly]
+
mom[0][ly
+
64]
;
mom[1][ly]
=
mom[1][ly]
+
mom[1][ly
+
64]
;
mom[2][ly]
=
mom[2][ly]
+
mom[2][ly
+
64]
;
mom[3][ly]
=
mom[3][ly]
+
mom[3][ly
+
64]
;
mom[4][ly]
=
mom[4][ly]
+
mom[4][ly
+
64]
;
mom[5][ly]
=
mom[5][ly]
+
mom[5][ly
+
64]
;
mom[6][ly]
=
mom[6][ly]
+
mom[6][ly
+
64]
;
mom[7][ly]
=
mom[7][ly]
+
mom[7][ly
+
64]
;
mom[8][ly]
=
mom[8][ly]
+
mom[8][ly
+
64]
;
mom[9][ly]
=
mom[9][ly]
+
mom[9][ly
+
64]
;
}
float4
zero
=
(
float4
)(
0
)
;
float4
full
=
(
float4
)(
255
)
;
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=4)
tmp[i/VLEN_F]
=
(
tmp[i/VLEN_F]!=zero
)
?full:zero
;
F
mom[10]
;
__local
F
m[10][128]
;
if
(
lidy
<
128
)
for
(
int
i
=
0
; i < 10; i ++)
m[i][lidy]
=
0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
F
lm[10]
=
{0}
;
F4
x0
=
(
F4
)(
0
)
;
F4
x1
=
(
F4
)(
0
)
;
F4
x2
=
(
F4
)(
0
)
;
F4
x3
=
(
F4
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+=VLEN_F )
{
F4
v_xt
=
(
F4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
F4
p
=
convert_F4
(
tmp[xt/VLEN_F]
)
;
F4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
x3
+=
xxp
*
v_xt
;
}
x0.s0
+=
x0.s1
+
x0.s2
+
x0.s3
;
x1.s0
+=
x1.s1
+
x1.s2
+
x1.s3
;
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
;
F
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
if
(
ly
<
32
)
{
m[9][lidy-bheight]
=
((
F
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
F
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
F
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
m[3][lidy-bheight]
=
x2.s0
; // m20
m[2][lidy-bheight]
=
py
; // m01
m[1][lidy-bheight]
=
x1.s0
; // m10
m[0][lidy-bheight]
=
x0.s0
; // m00
mom[0][ly]
=
mom[0][ly]
+
mom[0][ly
+
32]
;
mom[1][ly]
=
mom[1][ly]
+
mom[1][ly
+
32]
;
mom[2][ly]
=
mom[2][ly]
+
mom[2][ly
+
32]
;
mom[3][ly]
=
mom[3][ly]
+
mom[3][ly
+
32]
;
mom[4][ly]
=
mom[4][ly]
+
mom[4][ly
+
32]
;
mom[5][ly]
=
mom[5][ly]
+
mom[5][ly
+
32]
;
mom[6][ly]
=
mom[6][ly]
+
mom[6][ly
+
32]
;
mom[7][ly]
=
mom[7][ly]
+
mom[7][ly
+
32]
;
mom[8][ly]
=
mom[8][ly]
+
mom[8][ly
+
32]
;
mom[9][ly]
=
mom[9][ly]
+
mom[9][ly
+
32]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
else
if
(
lidy
<
bheight
)
if
(
ly
<
16
)
{
lm[9]
=
((
F
)
py
)
*
sy
; // m03
lm[8]
=
((
F
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
F
)
x2.s0
)
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
lm[3]
=
x2.s0
; // m20
lm[2]
=
py
; // m01
lm[1]
=
x1.s0
; // m10
lm[0]
=
x0.s0
; // m00
mom[0][ly]
=
mom[0][ly]
+
mom[0][ly
+
16]
;
mom[1][ly]
=
mom[1][ly]
+
mom[1][ly
+
16]
;
mom[2][ly]
=
mom[2][ly]
+
mom[2][ly
+
16]
;
mom[3][ly]
=
mom[3][ly]
+
mom[3][ly
+
16]
;
mom[4][ly]
=
mom[4][ly]
+
mom[4][ly
+
16]
;
mom[5][ly]
=
mom[5][ly]
+
mom[5][ly
+
16]
;
mom[6][ly]
=
mom[6][ly]
+
mom[6][ly
+
16]
;
mom[7][ly]
=
mom[7][ly]
+
mom[7][ly
+
16]
;
mom[8][ly]
=
mom[8][ly]
+
mom[8][ly
+
16]
;
mom[9][ly]
=
mom[9][ly]
+
mom[9][ly
+
16]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
TILE_SIZE/2
; j >= 1; j = j/2 )
if
(
ly
<
8
)
{
if
(
lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
lm[i]
=
lm[i]
+
m[i][lidy]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lidy
>=
j/2&&lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
m[i][lidy-j/2]
=
lm[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
mom[0][ly]
=
mom[0][ly]
+
mom[0][ly
+
8]
;
mom[1][ly]
=
mom[1][ly]
+
mom[1][ly
+
8]
;
mom[2][ly]
=
mom[2][ly]
+
mom[2][ly
+
8]
;
mom[3][ly]
=
mom[3][ly]
+
mom[3][ly
+
8]
;
mom[4][ly]
=
mom[4][ly]
+
mom[4][ly
+
8]
;
mom[5][ly]
=
mom[5][ly]
+
mom[5][ly
+
8]
;
mom[6][ly]
=
mom[6][ly]
+
mom[6][ly
+
8]
;
mom[7][ly]
=
mom[7][ly]
+
mom[7][ly
+
8]
;
mom[8][ly]
=
mom[8][ly]
+
mom[8][ly
+
8]
;
mom[9][ly]
=
mom[9][ly]
+
mom[9][ly
+
8]
;
}
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
ly
<
4
)
{
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
mom[0][ly]
=
mom[0][ly]
+
mom[0][ly
+
4]
;
mom[1][ly]
=
mom[1][ly]
+
mom[1][ly
+
4]
;
mom[2][ly]
=
mom[2][ly]
+
mom[2][ly
+
4]
;
mom[3][ly]
=
mom[3][ly]
+
mom[3][ly
+
4]
;
mom[4][ly]
=
mom[4][ly]
+
mom[4][ly
+
4]
;
mom[5][ly]
=
mom[5][ly]
+
mom[5][ly
+
4]
;
mom[6][ly]
=
mom[6][ly]
+
mom[6][ly
+
4]
;
mom[7][ly]
=
mom[7][ly]
+
mom[7][ly
+
4]
;
mom[8][ly]
=
mom[8][ly]
+
mom[8][ly
+
4]
;
mom[9][ly]
=
mom[9][ly]
+
mom[9][ly
+
4]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
if
(
ly
<
2
)
{
mom[0][ly]
=
mom[0][ly]
+
mom[0][ly
+
2]
;
mom[1][ly]
=
mom[1][ly]
+
mom[1][ly
+
2]
;
mom[2][ly]
=
mom[2][ly]
+
mom[2][ly
+
2]
;
mom[3][ly]
=
mom[3][ly]
+
mom[3][ly
+
2]
;
mom[4][ly]
=
mom[4][ly]
+
mom[4][ly
+
2]
;
mom[5][ly]
=
mom[5][ly]
+
mom[5][ly
+
2]
;
mom[6][ly]
=
mom[6][ly]
+
mom[6][ly
+
2]
;
mom[7][ly]
=
mom[7][ly]
+
mom[7][ly
+
2]
;
mom[8][ly]
=
mom[8][ly]
+
mom[8][ly
+
2]
;
mom[9][ly]
=
mom[9][ly]
+
mom[9][ly
+
2]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__kernel
void
CvMoments_D6
(
__global
F*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
F
tmp_coi[4]
; // get the coi data
F4
tmp[64]
;
int
VLEN_D
=
4
; // length of vetor
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
wgidy
=
get_group_id
(
0
)
;
int
wgidx
=
get_group_id
(
1
)
;
int
lidy
=
get_local_id
(
0
)
;
int
lidx
=
get_local_id
(
1
)
;
int
y
=
wgidy*TILE_SIZE
; // real Y index of pixel
int
x
=
wgidx*TILE_SIZE
; // real X index of pixel
int
kcn
=
(
cn==2
)
?2:4
;
int
rstep
=
min
(
src_step/8,
TILE_SIZE
)
;
int
tileSize_height
=
min
(
TILE_SIZE,
src_rows
-
y
)
;
int
tileSize_width
=
min
(
TILE_SIZE,
src_cols
-
x
)
;
if
(
y+lidy
<
src_rows
)
{
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep && (x+i) < src_cols; i++ )
*
((
__global
F*
)
src_data+
(
y+lidy
)
*src_step/8+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
if
(
ly
<
1
)
{
for
(
int
j=0
; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
tmp_coi[j]
=
*
(
src_data+
(
y+lidy
)
*src_step/8+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_D]
=
(
F4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
mom[0][ly]
=
mom[0][ly]
+
mom[0][ly
+
1]
;
mom[1][ly]
=
mom[1][ly]
+
mom[1][ly
+
1]
;
mom[2][ly]
=
mom[2][ly]
+
mom[2][ly
+
1]
;
mom[3][ly]
=
mom[3][ly]
+
mom[3][ly
+
1]
;
mom[4][ly]
=
mom[4][ly]
+
mom[4][ly
+
1]
;
mom[5][ly]
=
mom[5][ly]
+
mom[5][ly
+
1]
;
mom[6][ly]
=
mom[6][ly]
+
mom[6][ly
+
1]
;
mom[7][ly]
=
mom[7][ly]
+
mom[7][ly
+
1]
;
mom[8][ly]
=
mom[8][ly]
+
mom[8][ly
+
1]
;
mom[9][ly]
=
mom[9][ly]
+
mom[9][ly
+
1]
;
}
else
for
(
int
i=0
; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
tmp[i/VLEN_D]
=
(
F4
)(
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+3
))
;
}
F4
zero
=
(
F4
)(
0
)
;
F4
full
=
(
F4
)(
255
)
;
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D]
=
(
tmp[i/VLEN_D]!=zero
)
?full:zero
;
F
mom[10]
;
__local
F
m[10][128]
;
if
(
lidy
<
128
)
for
(
int
i=0
; i<10; i++)
m[i][lidy]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
F
lm[10]
=
{0}
;
F4
x0
=
(
F4
)(
0
)
;
F4
x1
=
(
F4
)(
0
)
;
F4
x2
=
(
F4
)(
0
)
;
F4
x3
=
(
F4
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+=VLEN_D )
{
F4
v_xt
=
(
F4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
F4
p
=
tmp[xt/VLEN_D]
;
F4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
x3
+=
xxp
*v_xt
;
}
x0.s0
+=
x0.s1
+
x0.s2
+
x0.s3
;
x1.s0
+=
x1.s1
+
x1.s2
+
x1.s3
;
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
;
F
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
if
(
binary
)
{
m[9][lidy-bheight]
=
((
F
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
F
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
F
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
m[3][lidy-bheight]
=
x2.s0
; // m20
m[2][lidy-bheight]
=
py
; // m01
m[1][lidy-bheight]
=
x1.s0
; // m10
m[0][lidy-bheight]
=
x0.s0
; // m00
}
else
if
(
lidy
<
bheight
)
WT
s
=
1./255
;
if
(
ly
<
10
)
{
lm[9]
=
((
F
)
py
)
*
sy
; // m03
lm[8]
=
((
F
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
F
)
x2.s0
)
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
lm[3]
=
x2.s0
; // m20
lm[2]
=
py
; // m01
lm[1]
=
x1.s0
; // m10
lm[0]
=
x0.s0
; // m00
mom[ly][0]
*=
s
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
j
=
TILE_SIZE/2
; j >= 1; j = j/2 )
{
if
(
lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
lm[i]
=
lm[i]
+
m[i][lidy]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lidy
>=
j/2&&lidy
<
j
)
for
(
int
i
=
0
; i < 10; i++ )
m[i][lidy-j/2]
=
lm[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
WT
xm
=
(
gidx
*
256
)
*
mom[0][0]
;
WT
ym
=
(
gidy
*
256
)
*
mom[0][0]
;
if
(
ly
==
0
)
{
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
mom[0][1]
=
mom[0][0]
;
mom[1][1]
=
mom[1][0]
+
xm
;
mom[2][1]
=
mom[2][0]
+
ym
;
mom[3][1]
=
mom[3][0]
+
gidx
*
256
*
(
mom[1][0]
*
2
+
xm
)
;
mom[4][1]
=
mom[4][0]
+
gidx
*
256
*
(
mom[2][0]
+
ym
)
+
gidy
*
256
*
mom[1][0]
;
mom[5][1]
=
mom[5][0]
+
gidy
*
256
*
(
mom[2][0]
*
2
+
ym
)
;
mom[6][1]
=
mom[6][0]
+
gidx
*
256
*
(
3
*
mom[3][0]
+
256
*
gidx
*
(
3
*
mom[1][0]
+
xm
))
;
mom[7][1]
=
mom[7][0]
+
gidx
*
256
*
(
2
*
(
mom[4][0]
+
256
*
gidy
*
mom[1][0]
)
+
256
*
gidx
*
(
mom[2][0]
+
ym
))
+
256
*
gidy
*
mom[3][0]
;
mom[8][1]
=
mom[8][0]
+
gidy
*
256
*
(
2
*
(
mom[4][0]
+
256
*
gidx
*
mom[2][0]
)
+
256
*
gidy
*
(
mom[1][0]
+
xm
))
+
256
*
gidx
*
mom[5][0]
;
mom[9][1]
=
mom[9][0]
+
gidy
*
256
*
(
3
*
mom[5][0]
+
256
*
gidy
*
(
3
*
mom[2][0]
+
ym
))
;
}
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
if
(
ly
<
10
)
{
dst_m[10
*
gidy
*
dst_step
+
ly
*
dst_step
+
gidx]
=
mom[ly][1]
;
}
}
modules/ocl/test/test_moments.cpp
View file @
e80f5bed
...
...
@@ -10,18 +10,19 @@ using namespace cvtest;
using
namespace
testing
;
using
namespace
std
;
PARAM_TEST_CASE
(
MomentsTest
,
MatType
,
bool
)
PARAM_TEST_CASE
(
MomentsTest
,
MatType
,
bool
,
bool
)
{
int
type
;
cv
::
Mat
mat
1
;
cv
::
Mat
mat
;
bool
test_contours
;
bool
binaryImage
;
virtual
void
SetUp
()
{
type
=
GET_PARAM
(
0
);
test_contours
=
GET_PARAM
(
1
);
cv
::
Size
size
(
10
*
MWIDTH
,
10
*
MHEIGHT
);
mat1
=
randomMat
(
size
,
type
,
5
,
16
,
false
);
cv
::
Size
size
(
10
*
MWIDTH
,
10
*
MHEIGHT
);
mat
=
randomMat
(
size
,
type
,
0
,
256
,
false
);
binaryImage
=
GET_PARAM
(
2
);
}
void
Compare
(
Moments
&
cpu
,
Moments
&
gpu
)
...
...
@@ -29,16 +30,13 @@ PARAM_TEST_CASE(MomentsTest, MatType, bool)
Mat
gpu_dst
,
cpu_dst
;
HuMoments
(
cpu
,
cpu_dst
);
HuMoments
(
gpu
,
gpu_dst
);
EXPECT_MAT_NEAR
(
gpu_dst
,
cpu_dst
,
.5
);
EXPECT_MAT_NEAR
(
gpu_dst
,
cpu_dst
,
1e-3
);
}
};
OCL_TEST_P
(
MomentsTest
,
Mat
)
{
bool
binaryImage
=
0
;
oclMat
src_d
(
mat
);
for
(
int
j
=
0
;
j
<
LOOP_TIMES
;
j
++
)
{
if
(
test_contours
)
...
...
@@ -53,18 +51,16 @@ OCL_TEST_P(MomentsTest, Mat)
for
(
size_t
i
=
0
;
i
<
contours
.
size
();
i
++
)
{
Moments
m
=
moments
(
contours
[
i
],
false
);
Moments
dm
=
ocl
::
ocl_moments
(
contours
[
i
]
,
false
);
Moments
dm
=
ocl
::
ocl_moments
(
contours
[
i
]);
Compare
(
m
,
dm
);
}
}
cv
::
_InputArray
_array
(
mat1
);
cv
::
Moments
CvMom
=
cv
::
moments
(
_array
,
binaryImage
);
cv
::
Moments
oclMom
=
cv
::
ocl
::
ocl_moments
(
_array
,
binaryImage
);
cv
::
Moments
CvMom
=
cv
::
moments
(
mat
,
binaryImage
);
cv
::
Moments
oclMom
=
cv
::
ocl
::
ocl_moments
(
src_d
,
binaryImage
);
Compare
(
CvMom
,
oclMom
);
}
}
INSTANTIATE_TEST_CASE_P
(
OCL_ImgProc
,
MomentsTest
,
Combine
(
Values
(
CV_8UC1
,
CV_16UC1
,
CV_16SC1
,
CV_64FC1
),
Values
(
true
,
fals
e
)));
Values
(
CV_8UC1
,
CV_16UC1
,
CV_16SC1
,
CV_32FC1
,
CV_64FC1
),
Values
(
false
,
true
),
Values
(
false
,
tru
e
)));
#endif // HAVE_OPENCL
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment