Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
6c99b5c9
Commit
6c99b5c9
authored
Mar 26, 2013
by
Andrey Kamaev
Committed by
OpenCV Buildbot
Mar 26, 2013
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #705 from bitwangyaoyao:2.4_oclFix
parents
6a6ae355
d6f1ad8c
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
863 additions
and
1444 deletions
+863
-1444
OpenCVDetectOpenCL.cmake
cmake/OpenCVDetectOpenCL.cmake
+1
-1
cl2cpp.cmake
cmake/cl2cpp.cmake
+1
-0
brute_force_matcher.cpp
modules/ocl/src/brute_force_matcher.cpp
+119
-573
haar.cpp
modules/ocl/src/haar.cpp
+2
-2
match_template.cpp
modules/ocl/src/match_template.cpp
+97
-45
moments.cpp
modules/ocl/src/moments.cpp
+12
-36
brute_force_match.cl
modules/ocl/src/opencl/brute_force_match.cl
+97
-218
haarobjectdetect.cl
modules/ocl/src/opencl/haarobjectdetect.cl
+8
-0
match_template.cl
modules/ocl/src/opencl/match_template.cl
+94
-59
moments.cl
modules/ocl/src/opencl/moments.cl
+212
-262
pyr_up.cl
modules/ocl/src/opencl/pyr_up.cl
+212
-240
stereobm.cl
modules/ocl/src/opencl/stereobm.cl
+1
-1
test_brute_force_matcher.cpp
modules/ocl/test/test_brute_force_matcher.cpp
+2
-2
test_match_template.cpp
modules/ocl/test/test_match_template.cpp
+5
-5
No files found.
cmake/OpenCVDetectOpenCL.cmake
View file @
6c99b5c9
...
...
@@ -43,7 +43,7 @@ if(OPENCL_FOUND)
set
(
OPENCL_LIBRARIES
${
OPENCL_LIBRARY
}
)
if
(
X86_64
)
set
(
CLAMD_POSSIBLE_LIB_SUFFIXES lib
32
/import
)
set
(
CLAMD_POSSIBLE_LIB_SUFFIXES lib
64
/import
)
elseif
(
X86
)
set
(
CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import
)
endif
()
...
...
cmake/cl2cpp.cmake
View file @
6c99b5c9
...
...
@@ -18,6 +18,7 @@ foreach(cl ${cl_list})
string
(
REPLACE
"
\t
"
" "
lines
"
${
lines
}
"
)
string
(
REGEX REPLACE
"/
\\
*([^*]/|
\\
*[^/]|[^*/])*
\\
*/"
""
lines
"
${
lines
}
"
)
# multiline comments
string
(
REGEX REPLACE
"/
\\
*([^
\n
])*
\\
*/"
""
lines
"
${
lines
}
"
)
# single-line comments
string
(
REGEX REPLACE
"[ ]*//[^
\n
]*
\n
"
"
\n
"
lines
"
${
lines
}
"
)
# single-line comments
string
(
REGEX REPLACE
"
\n
[ ]*(
\n
[ ]*)*"
"
\n
"
lines
"
${
lines
}
"
)
# empty lines & leading whitespace
string
(
REGEX REPLACE
"^
\n
"
""
lines
"
${
lines
}
"
)
# leading new line
...
...
modules/ocl/src/brute_force_matcher.cpp
View file @
6c99b5c9
...
...
@@ -44,6 +44,7 @@
//M*/
#include "precomp.hpp"
#include <functional>
#include <iterator>
#include <vector>
...
...
@@ -60,10 +61,11 @@ namespace cv
}
}
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
/*, typename Mask*/
>
void
matchUnrolledCached
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
/*mask*/
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
int
distType
)
{
assert
(
query
.
type
()
==
CV_32F
);
cv
::
ocl
::
Context
*
ctx
=
query
.
clCxt
;
size_t
globalSize
[]
=
{(
query
.
rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
...
...
@@ -91,20 +93,21 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
std
::
string
kernelName
=
"BruteForceMatch_UnrollMatch"
;
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
query
.
depth
()
);
}
}
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
/*, typename Mask*/
>
void
matchUnrolledCached
(
const
oclMat
/*query*/
,
const
oclMat
*
/*trains*/
,
int
/*n*/
,
const
oclMat
/*mask*/
,
const
oclMat
&
/*bestTrainIdx*/
,
const
oclMat
&
/*bestImgIdx*/
,
const
oclMat
&
/*bestDistance*/
,
int
/*distType*/
)
{
}
template
<
int
BLOCK_SIZE
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
/*, typename Mask*/
>
void
match
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
/*mask*/
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
int
distType
)
{
assert
(
query
.
type
()
==
CV_32F
);
cv
::
ocl
::
Context
*
ctx
=
query
.
clCxt
;
size_t
globalSize
[]
=
{(
query
.
rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
...
...
@@ -130,21 +133,22 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
std
::
string
kernelName
=
"BruteForceMatch_Match"
;
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
query
.
depth
()
);
}
}
template
<
int
BLOCK_SIZE
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
/*, typename Mask*/
>
void
match
(
const
oclMat
/*query*/
,
const
oclMat
*
/*trains*/
,
int
/*n*/
,
const
oclMat
/*mask*/
,
const
oclMat
&
/*bestTrainIdx*/
,
const
oclMat
&
/*bestImgIdx*/
,
const
oclMat
&
/*bestDistance*/
,
int
/*distType*/
)
{
}
//radius_matchUnrolledCached
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
/*, typename Mask*/
>
void
matchUnrolledCached
(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
/*mask*/
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
,
int
distType
)
{
assert
(
query
.
type
()
==
CV_32F
);
cv
::
ocl
::
Context
*
ctx
=
query
.
clCxt
;
size_t
globalSize
[]
=
{(
train
.
rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
(
query
.
rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
...
...
@@ -176,15 +180,16 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
std
::
string
kernelName
=
"BruteForceMatch_RadiusUnrollMatch"
;
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
query
.
depth
()
);
}
}
//radius_match
template
<
int
BLOCK_SIZE
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
/*, typename Mask*/
>
void
radius_match
(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
/*mask*/
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
,
int
distType
)
{
assert
(
query
.
type
()
==
CV_32F
);
cv
::
ocl
::
Context
*
ctx
=
query
.
clCxt
;
size_t
globalSize
[]
=
{(
train
.
rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
(
query
.
rows
+
BLOCK_SIZE
-
1
)
/
BLOCK_SIZE
*
BLOCK_SIZE
,
1
};
size_t
localSize
[]
=
{
BLOCK_SIZE
,
BLOCK_SIZE
,
1
};
...
...
@@ -214,263 +219,70 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
std
::
string
kernelName
=
"BruteForceMatch_RadiusMatch"
;
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
//float *dis = (float *)clEnqueueMapBuffer(ctx->impl->clCmdQueue, (cl_mem)distance.data, CL_TRUE, CL_MAP_READ, 0, 8, 0, NULL, NULL, NULL);
//printf("%f, %f\n", dis[0], dis[1]);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
query
.
depth
());
}
}
// with mask
template
<
typename
T
/*, typename Mask*/
>
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
static
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
int
distType
)
{
const
oclMat
zeroMask
;
const
oclMat
&
tempMask
=
mask
.
data
?
mask
:
zeroMask
;
if
(
query
.
cols
<=
64
)
{
matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
}*/
else
{
match
<
16
,
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
}
// without mask
template
<
typename
T
/*, typename Mask*/
>
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
int
distType
)
{
oclMat
mask
;
if
(
query
.
cols
<=
64
)
{
matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
matchUnrolledCached
<
16
,
64
>
(
query
,
train
,
tempMask
,
trainIdx
,
distance
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
matchUnrolledCached
<
16
,
128
>
(
query
,
train
,
tempMask
,
trainIdx
,
distance
,
distType
);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
}*/
else
{
match
<
16
,
T
>
(
query
,
train
,
m
ask
,
trainIdx
,
distance
,
distType
);
match
<
16
>
(
query
,
train
,
tempM
ask
,
trainIdx
,
distance
,
distType
);
}
}
template
<
typename
T
/*, typename Mask*/
>
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
*
trains
,
int
n
,
const
oclMat
&
mask
,
static
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
*
trains
,
int
n
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
imgIdx
,
const
oclMat
&
distance
,
int
distType
)
{
const
oclMat
zeroMask
;
const
oclMat
&
tempMask
=
mask
.
data
?
mask
:
zeroMask
;
if
(
query
.
cols
<=
64
)
{
matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
trains
,
n
,
m
ask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
matchUnrolledCached
<
16
,
64
>
(
query
,
trains
,
n
,
tempM
ask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
trains
,
n
,
mask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
matchUnrolledCached
<
16
,
128
>
(
query
,
trains
,
n
,
tempMask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}*/
else
{
match
<
16
,
T
>
(
query
,
trains
,
n
,
mask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
}
template
<
typename
T
/*, typename Mask*/
>
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
*
trains
,
int
n
,
const
oclMat
&
trainIdx
,
const
oclMat
&
imgIdx
,
const
oclMat
&
distance
,
int
distType
)
{
oclMat
mask
;
if
(
query
.
cols
<=
64
)
{
matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
trains
,
n
,
mask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
trains
,
n
,
mask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}*/
else
{
match
<
16
,
T
>
(
query
,
trains
,
n
,
mask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
match
<
16
>
(
query
,
trains
,
n
,
tempMask
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
}
//radius matchDispatcher
// with mask
template
<
typename
T
/*, typename Mask*/
>
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
mask
,
static
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
,
int
distType
)
{
const
oclMat
zeroMask
;
const
oclMat
&
tempMask
=
mask
.
data
?
mask
:
zeroMask
;
if
(
query
.
cols
<=
64
)
{
matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
train
,
maxDistance
,
m
ask
,
trainIdx
,
distance
,
nMatches
,
distType
);
matchUnrolledCached
<
16
,
64
>
(
query
,
train
,
maxDistance
,
tempM
ask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
matchUnrolledCached
<
16
,
128
>
(
query
,
train
,
maxDistance
,
tempMask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
radius_match
<
16
,
T
>
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
}
// without mask
template
<
typename
T
/*, typename Mask*/
>
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
,
int
distType
)
{
oclMat
mask
;
if
(
query
.
cols
<=
64
)
{
matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
radius_match
<
16
,
T
>
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
}
template
<
typename
T
/*, typename Mask*/
>
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
n
,
float
maxDistance
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
,
int
distType
)
{
if
(
query
.
cols
<=
64
)
{
matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
train
,
n
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
train
,
n
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match
<
16
,
T
>
(
query
,
train
,
n
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
}
// without mask
template
<
typename
T
/*, typename Mask*/
>
void
matchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
n
,
float
maxDistance
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
,
int
distType
)
{
oclMat
mask
;
if
(
query
.
cols
<=
64
)
{
matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
train
,
n
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
train
,
n
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match
<
16
,
T
>
(
query
,
train
,
n
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
radius_match
<
16
>
(
query
,
train
,
maxDistance
,
tempMask
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
}
//knn match Dispatcher
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
/*, typename Mask*/
>
void
knn_matchUnrolledCached
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
/*mask*/
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
int
distType
)
{
...
...
@@ -501,11 +313,11 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
std
::
string
kernelName
=
"BruteForceMatch_knnUnrollMatch"
;
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
query
.
depth
()
);
}
}
template
<
int
BLOCK_SIZE
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
/*, typename Mask*/
>
void
knn_match
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
/*mask*/
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
int
distType
)
{
...
...
@@ -534,11 +346,11 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
std
::
string
kernelName
=
"BruteForceMatch_knnMatch"
;
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
query
.
depth
()
);
}
}
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
,
int
MAX_DESC_LEN
/*, typename Mask*/
>
void
calcDistanceUnrolled
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
/*mask*/
,
const
oclMat
&
allDist
,
int
distType
)
{
cv
::
ocl
::
Context
*
ctx
=
query
.
clCxt
;
...
...
@@ -567,11 +379,11 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
std
::
string
kernelName
=
"BruteForceMatch_calcDistanceUnrolled"
;
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
query
.
depth
()
);
}
}
template
<
int
BLOCK_SIZE
,
typename
T
/*, typename Mask*/
>
template
<
int
BLOCK_SIZE
/*, typename Mask*/
>
void
calcDistance
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
/*mask*/
,
const
oclMat
&
allDist
,
int
distType
)
{
cv
::
ocl
::
Context
*
ctx
=
query
.
clCxt
;
...
...
@@ -598,69 +410,43 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask
std
::
string
kernelName
=
"BruteForceMatch_calcDistance"
;
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
query
.
depth
()
);
}
}
///////////////////////////////////////////////////////////////////////////////
// Calc Distance dispatcher
template
<
typename
T
/*, typename Mask*/
>
void
calcDistanceDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
static
void
calcDistanceDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
const
oclMat
&
allDist
,
int
distType
)
{
if
(
query
.
cols
<=
64
)
{
calcDistanceUnrolled
<
16
,
64
,
T
>
(
query
,
train
,
mask
,
allDist
,
distType
);
calcDistanceUnrolled
<
16
,
64
>
(
query
,
train
,
mask
,
allDist
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
calcDistanceUnrolled
<
16
,
128
,
T
>
(
query
,
train
,
mask
,
allDist
,
distType
);
calcDistanceUnrolled
<
16
,
128
>
(
query
,
train
,
mask
,
allDist
,
distType
);
}
/*else if (query.cols <= 256)
{
calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
}
else if (query.cols <= 512)
{
calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
}
else if (query.cols <= 1024)
{
calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
}*/
else
{
calcDistance
<
16
,
T
>
(
query
,
train
,
mask
,
allDist
,
distType
);
calcDistance
<
16
>
(
query
,
train
,
mask
,
allDist
,
distType
);
}
}
template
<
typename
T
/*, typename Mask*/
>
void
match2Dispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
static
void
match2Dispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
int
distType
)
{
if
(
query
.
cols
<=
64
)
{
knn_matchUnrolledCached
<
16
,
64
,
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
knn_matchUnrolledCached
<
16
,
64
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
else
if
(
query
.
cols
<=
128
)
{
knn_matchUnrolledCached
<
16
,
128
,
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
knn_matchUnrolledCached
<
16
,
128
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}*/
else
{
knn_match
<
16
,
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
knn_match
<
16
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
}
...
...
@@ -686,7 +472,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o
//args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols ));
//args.push_back( make_pair( sizeof(cl_int), (void *)&query.step ));
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
-
1
,
-
1
);
openCLExecuteKernel
(
ctx
,
&
brute_force_match
,
kernelName
,
globalSize
,
localSize
,
args
,
trainIdx
.
depth
()
,
-
1
);
}
}
...
...
@@ -695,206 +481,22 @@ static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &
findKnnMatch
<
256
>
(
k
,
trainIdx
,
distance
,
allDist
,
distType
);
}
//with mask
template
<
typename
T
/*, typename Mask*/
>
void
kmatchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
k
,
const
oclMat
&
mask
,
static
void
kmatchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
k
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
allDist
,
int
distType
)
{
const
oclMat
zeroMask
;
const
oclMat
&
tempMask
=
mask
.
data
?
mask
:
zeroMask
;
if
(
k
==
2
)
{
match2Dispatcher
<
T
>
(
query
,
train
,
m
ask
,
trainIdx
,
distance
,
distType
);
match2Dispatcher
(
query
,
train
,
tempM
ask
,
trainIdx
,
distance
,
distType
);
}
else
{
calcDistanceDispatcher
<
T
>
(
query
,
train
,
m
ask
,
allDist
,
distType
);
calcDistanceDispatcher
(
query
,
train
,
tempM
ask
,
allDist
,
distType
);
findKnnMatchDispatcher
(
k
,
trainIdx
,
distance
,
allDist
,
distType
);
}
}
//without mask
template
<
typename
T
/*, typename Mask*/
>
void
kmatchDispatcher
(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
k
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
allDist
,
int
distType
)
{
oclMat
mask
;
if
(
k
==
2
)
{
match2Dispatcher
<
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
else
{
calcDistanceDispatcher
<
T
>
(
query
,
train
,
mask
,
allDist
,
distType
);
findKnnMatchDispatcher
(
k
,
trainIdx
,
distance
,
allDist
,
distType
);
}
}
template
<
typename
T
>
void
ocl_matchL1_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
)
{
int
distType
=
0
;
if
(
mask
.
data
)
{
matchDispatcher
<
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
else
{
matchDispatcher
<
T
>
(
query
,
train
,
trainIdx
,
distance
,
distType
);
}
}
template
<
typename
T
>
void
ocl_matchL1_gpu
(
const
oclMat
&
query
,
const
oclMat
&
trains
,
const
oclMat
&
masks
,
const
oclMat
&
trainIdx
,
const
oclMat
&
imgIdx
,
const
oclMat
&
distance
)
{
int
distType
=
0
;
if
(
masks
.
data
)
{
matchDispatcher
<
T
>
(
query
,
(
const
oclMat
*
)
trains
.
ptr
(),
trains
.
cols
,
masks
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
else
{
matchDispatcher
<
T
>
(
query
,
(
const
oclMat
*
)
trains
.
ptr
(),
trains
.
cols
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
}
template
<
typename
T
>
void
ocl_matchL2_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
)
{
int
distType
=
1
;
if
(
mask
.
data
)
{
matchDispatcher
<
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
else
{
matchDispatcher
<
T
>
(
query
,
train
,
trainIdx
,
distance
,
distType
);
}
}
template
<
typename
T
>
void
ocl_matchL2_gpu
(
const
oclMat
&
query
,
const
oclMat
&
trains
,
const
oclMat
&
masks
,
const
oclMat
&
trainIdx
,
const
oclMat
&
imgIdx
,
const
oclMat
&
distance
)
{
int
distType
=
1
;
if
(
masks
.
data
)
{
matchDispatcher
<
T
>
(
query
,
(
const
oclMat
*
)
trains
.
ptr
(),
trains
.
cols
,
masks
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
else
{
matchDispatcher
<
T
>
(
query
,
(
const
oclMat
*
)
trains
.
ptr
(),
trains
.
cols
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
}
template
<
typename
T
>
void
ocl_matchHamming_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
)
{
int
distType
=
2
;
if
(
mask
.
data
)
{
matchDispatcher
<
T
>
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
}
else
{
matchDispatcher
<
T
>
(
query
,
train
,
trainIdx
,
distance
,
distType
);
}
}
template
<
typename
T
>
void
ocl_matchHamming_gpu
(
const
oclMat
&
query
,
const
oclMat
&
trains
,
const
oclMat
&
masks
,
const
oclMat
&
trainIdx
,
const
oclMat
&
imgIdx
,
const
oclMat
&
distance
)
{
int
distType
=
2
;
if
(
masks
.
data
)
{
matchDispatcher
<
T
>
(
query
,
(
const
oclMat
*
)
trains
.
ptr
(),
trains
.
cols
,
masks
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
else
{
matchDispatcher
<
T
>
(
query
,
(
const
oclMat
*
)
trains
.
ptr
(),
trains
.
cols
,
trainIdx
,
imgIdx
,
distance
,
distType
);
}
}
// knn caller
template
<
typename
T
>
void
ocl_matchL1_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
k
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
allDist
)
{
int
distType
=
0
;
if
(
mask
.
data
)
kmatchDispatcher
<
T
>
(
query
,
train
,
k
,
mask
,
trainIdx
,
distance
,
allDist
,
distType
);
else
kmatchDispatcher
<
T
>
(
query
,
train
,
k
,
trainIdx
,
distance
,
allDist
,
distType
);
}
template
<
typename
T
>
void
ocl_matchL2_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
k
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
allDist
)
{
int
distType
=
1
;
if
(
mask
.
data
)
kmatchDispatcher
<
T
>
(
query
,
train
,
k
,
mask
,
trainIdx
,
distance
,
allDist
,
distType
);
else
kmatchDispatcher
<
T
>
(
query
,
train
,
k
,
trainIdx
,
distance
,
allDist
,
distType
);
}
template
<
typename
T
>
void
ocl_matchHamming_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
k
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
allDist
)
{
int
distType
=
2
;
if
(
mask
.
data
)
kmatchDispatcher
<
T
>
(
query
,
train
,
k
,
mask
,
trainIdx
,
distance
,
allDist
,
distType
);
else
kmatchDispatcher
<
T
>
(
query
,
train
,
k
,
trainIdx
,
distance
,
allDist
,
distType
);
}
//radius caller
template
<
typename
T
>
void
ocl_matchL1_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
)
{
int
distType
=
0
;
if
(
mask
.
data
)
matchDispatcher
<
T
>
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
else
matchDispatcher
<
T
>
(
query
,
train
,
maxDistance
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
template
<
typename
T
>
void
ocl_matchL2_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
)
{
int
distType
=
1
;
if
(
mask
.
data
)
matchDispatcher
<
T
>
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
else
matchDispatcher
<
T
>
(
query
,
train
,
maxDistance
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
template
<
typename
T
>
void
ocl_matchHamming_gpu
(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
)
{
int
distType
=
2
;
if
(
mask
.
data
)
matchDispatcher
<
T
>
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
else
matchDispatcher
<
T
>
(
query
,
train
,
maxDistance
,
trainIdx
,
distance
,
nMatches
,
distType
);
}
cv
::
ocl
::
BruteForceMatcher_OCL_base
::
BruteForceMatcher_OCL_base
(
DistType
distType_
)
:
distType
(
distType_
)
{
}
...
...
@@ -929,38 +531,28 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const
{
if
(
query
.
empty
()
||
train
.
empty
())
return
;
// match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
int
callType
=
query
.
depth
();
char
cvFuncName
[]
=
"singleMatch"
;
if
(
callType
!=
5
)
CV_ERROR
(
CV_UNSUPPORTED_FORMAT_ERR
,
"BruteForceMatch OpenCL only support float type query!
\n
"
);
typedef
void
(
*
caller_t
)(
const
oclMat
&
query
,
const
oclMat
&
train
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
);
static
const
caller_t
callers
[
3
][
6
]
=
if
((
distType
==
0
&&
callType
==
1
)
||
(
distType
==
1
&&
callType
!=
5
)
||
(
distType
==
2
&&
(
callType
!=
0
||
callType
!=
2
||
callType
!=
4
)))
{
{
ocl_matchL1_gpu
<
unsigned
char
>
,
0
/*ocl_matchL1_gpu<signed char>*/
,
ocl_matchL1_gpu
<
unsigned
short
>
,
ocl_matchL1_gpu
<
short
>
,
ocl_matchL1_gpu
<
int
>
,
ocl_matchL1_gpu
<
float
>
},
{
0
/*ocl_matchL2_gpu<unsigned char>*/
,
0
/*ocl_matchL2_gpu<signed char>*/
,
0
/*ocl_matchL2_gpu<unsigned short>*/
,
0
/*ocl_matchL2_gpu<short>*/
,
0
/*ocl_matchL2_gpu<int>*/
,
ocl_matchL2_gpu
<
float
>
},
{
ocl_matchHamming_gpu
<
unsigned
char
>
,
0
/*ocl_matchHamming_gpu<signed char>*/
,
ocl_matchHamming_gpu
<
unsigned
short
>
,
0
/*ocl_matchHamming_gpu<short>*/
,
ocl_matchHamming_gpu
<
int
>
,
0
/*ocl_matchHamming_gpu<float>*/
}
};
CV_ERROR
(
CV_UNSUPPORTED_DEPTH_ERR
,
"BruteForceMatch OpenCL only support float type query!
\n
"
);
}
CV_Assert
(
query
.
channels
()
==
1
&&
query
.
depth
()
<
CV_64F
);
CV_Assert
(
train
.
cols
==
query
.
cols
&&
train
.
type
()
==
query
.
type
());
const
int
nQuery
=
query
.
rows
;
trainIdx
.
create
(
1
,
nQuery
,
CV_32S
);
distance
.
create
(
1
,
nQuery
,
CV_32F
);
trainIdx
.
create
(
1
,
query
.
rows
,
CV_32S
);
distance
.
create
(
1
,
query
.
rows
,
CV_32F
);
caller_t
func
=
callers
[
distType
][
query
.
depth
()];
func
(
query
,
train
,
mask
,
trainIdx
,
distance
);
matchDispatcher
(
query
,
train
,
mask
,
trainIdx
,
distance
,
distType
);
exit
:
return
;
}
void
cv
::
ocl
::
BruteForceMatcher_OCL_base
::
matchDownload
(
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
vector
<
DMatch
>
&
matches
)
...
...
@@ -1062,40 +654,27 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c
if
(
query
.
empty
()
||
trainCollection
.
empty
())
return
;
typedef
void
(
*
caller_t
)(
const
oclMat
&
query
,
const
oclMat
&
trains
,
const
oclMat
&
masks
,
const
oclMat
&
trainIdx
,
const
oclMat
&
imgIdx
,
const
oclMat
&
distance
);
// match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
int
callType
=
query
.
depth
();
char
cvFuncName
[]
=
"matchCollection"
;
if
(
callType
!=
5
)
CV_ERROR
(
CV_UNSUPPORTED_FORMAT_ERR
,
"BruteForceMatch OpenCL only support float type query!
\n
"
);
static
const
caller_t
callers
[
3
][
6
]
=
if
((
distType
==
0
&&
callType
==
1
)
||
(
distType
==
1
&&
callType
!=
5
)
||
(
distType
==
2
&&
(
callType
!=
0
||
callType
!=
2
||
callType
!=
4
)))
{
{
ocl_matchL1_gpu
<
unsigned
char
>
,
0
/*matchL1_gpu<signed char>*/
,
ocl_matchL1_gpu
<
unsigned
short
>
,
ocl_matchL1_gpu
<
short
>
,
ocl_matchL1_gpu
<
int
>
,
ocl_matchL1_gpu
<
float
>
},
{
0
/*matchL2_gpu<unsigned char>*/
,
0
/*matchL2_gpu<signed char>*/
,
0
/*matchL2_gpu<unsigned short>*/
,
0
/*matchL2_gpu<short>*/
,
0
/*matchL2_gpu<int>*/
,
ocl_matchL2_gpu
<
float
>
},
{
ocl_matchHamming_gpu
<
unsigned
char
>
,
0
/*matchHamming_gpu<signed char>*/
,
ocl_matchHamming_gpu
<
unsigned
short
>
,
0
/*matchHamming_gpu<short>*/
,
ocl_matchHamming_gpu
<
int
>
,
0
/*matchHamming_gpu<float>*/
}
};
CV_ERROR
(
CV_UNSUPPORTED_DEPTH_ERR
,
"BruteForceMatch OpenCL only support float type query!
\n
"
);
}
CV_Assert
(
query
.
channels
()
==
1
&&
query
.
depth
()
<
CV_64F
);
const
int
nQuery
=
query
.
rows
;
trainIdx
.
create
(
1
,
nQuery
,
CV_32S
);
imgIdx
.
create
(
1
,
nQuery
,
CV_32S
);
distance
.
create
(
1
,
nQuery
,
CV_32F
);
caller_t
func
=
callers
[
distType
][
query
.
depth
()];
CV_Assert
(
func
!=
0
);
trainIdx
.
create
(
1
,
query
.
rows
,
CV_32S
);
imgIdx
.
create
(
1
,
query
.
rows
,
CV_32S
);
distance
.
create
(
1
,
query
.
rows
,
CV_32F
);
func
(
query
,
trainCollection
,
masks
,
trainIdx
,
imgIdx
,
distance
);
matchDispatcher
(
query
,
(
const
oclMat
*
)
trainCollection
.
ptr
(),
trainCollection
.
cols
,
masks
,
trainIdx
,
imgIdx
,
distance
,
distType
);
exit
:
return
;
}
void
cv
::
ocl
::
BruteForceMatcher_OCL_base
::
matchDownload
(
const
oclMat
&
trainIdx
,
const
oclMat
&
imgIdx
,
const
oclMat
&
distance
,
vector
<
DMatch
>
&
matches
)
...
...
@@ -1164,52 +743,39 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co
if
(
query
.
empty
()
||
train
.
empty
())
return
;
typedef
void
(
*
caller_t
)(
const
oclMat
&
query
,
const
oclMat
&
train
,
int
k
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
allDist
);
// match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
int
callType
=
query
.
depth
(
);
static
const
caller_t
callers
[
3
][
6
]
=
char
cvFuncName
[]
=
"knnMatchSingle"
;
if
(
callType
!=
5
)
CV_ERROR
(
CV_UNSUPPORTED_FORMAT_ERR
,
"BruteForceMatch OpenCL only support float type query!
\n
"
);
if
((
distType
==
0
&&
callType
==
1
)
||
(
distType
==
1
&&
callType
!=
5
)
||
(
distType
==
2
&&
(
callType
!=
0
||
callType
!=
2
||
callType
!=
4
)))
{
{
ocl_matchL1_gpu
<
unsigned
char
>
,
0
/*ocl_matchL1_gpu<signed char>*/
,
ocl_matchL1_gpu
<
unsigned
short
>
,
ocl_matchL1_gpu
<
short
>
,
ocl_matchL1_gpu
<
int
>
,
ocl_matchL1_gpu
<
float
>
},
{
0
/*ocl_matchL2_gpu<unsigned char>*/
,
0
/*ocl_matchL2_gpu<signed char>*/
,
0
/*ocl_matchL2_gpu<unsigned short>*/
,
0
/*ocl_matchL2_gpu<short>*/
,
0
/*ocl_matchL2_gpu<int>*/
,
ocl_matchL2_gpu
<
float
>
},
{
ocl_matchHamming_gpu
<
unsigned
char
>
,
0
/*ocl_matchHamming_gpu<signed char>*/
,
ocl_matchHamming_gpu
<
unsigned
short
>
,
0
/*ocl_matchHamming_gpu<short>*/
,
ocl_matchHamming_gpu
<
int
>
,
0
/*ocl_matchHamming_gpu<float>*/
}
};
CV_ERROR
(
CV_UNSUPPORTED_DEPTH_ERR
,
"BruteForceMatch OpenCL only support float type query!
\n
"
);
}
CV_Assert
(
query
.
channels
()
==
1
&&
query
.
depth
()
<
CV_64F
);
CV_Assert
(
train
.
type
()
==
query
.
type
()
&&
train
.
cols
==
query
.
cols
);
const
int
nQuery
=
query
.
rows
;
const
int
nTrain
=
train
.
rows
;
if
(
k
==
2
)
{
trainIdx
.
create
(
1
,
nQuery
,
CV_32SC2
);
distance
.
create
(
1
,
nQuery
,
CV_32FC2
);
trainIdx
.
create
(
1
,
query
.
rows
,
CV_32SC2
);
distance
.
create
(
1
,
query
.
rows
,
CV_32FC2
);
}
else
{
trainIdx
.
create
(
nQuery
,
k
,
CV_32S
);
distance
.
create
(
nQuery
,
k
,
CV_32F
);
allDist
.
create
(
nQuery
,
nTrain
,
CV_32FC1
);
trainIdx
.
create
(
query
.
rows
,
k
,
CV_32S
);
distance
.
create
(
query
.
rows
,
k
,
CV_32F
);
allDist
.
create
(
query
.
rows
,
train
.
rows
,
CV_32FC1
);
}
trainIdx
.
setTo
(
Scalar
::
all
(
-
1
));
caller_t
func
=
callers
[
distType
][
query
.
depth
()];
CV_Assert
(
func
!=
0
);
func
(
query
,
train
,
k
,
mask
,
trainIdx
,
distance
,
allDist
);
kmatchDispatcher
(
query
,
train
,
k
,
mask
,
trainIdx
,
distance
,
allDist
,
distType
);
exit
:
return
;
}
void
cv
::
ocl
::
BruteForceMatcher_OCL_base
::
knnMatchDownload
(
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
vector
<
vector
<
DMatch
>
>
&
matches
,
bool
compactResult
)
...
...
@@ -1394,8 +960,6 @@ namespace
void
cv
::
ocl
::
BruteForceMatcher_OCL_base
::
knnMatch
(
const
oclMat
&
query
,
vector
<
vector
<
DMatch
>
>
&
matches
,
int
k
,
const
vector
<
oclMat
>
&
masks
,
bool
compactResult
)
{
if
(
k
==
2
)
{
oclMat
trainCollection
;
...
...
@@ -1455,50 +1019,34 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query,
if
(
query
.
empty
()
||
train
.
empty
())
return
;
typedef
void
(
*
caller_t
)(
const
oclMat
&
query
,
const
oclMat
&
train
,
float
maxDistance
,
const
oclMat
&
mask
,
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
);
// match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
int
callType
=
query
.
depth
();
char
cvFuncName
[]
=
"radiusMatchSingle"
;
if
(
callType
!=
5
)
CV_ERROR
(
CV_UNSUPPORTED_FORMAT_ERR
,
"BruteForceMatch OpenCL only support float type query!
\n
"
);
//#if
0
static
const
caller_t
callers
[
3
][
6
]
=
if
((
distType
==
0
&&
callType
==
1
)
||
(
distType
==
1
&&
callType
!=
5
)
||
(
distType
==
2
&&
(
callType
!=
0
||
callType
!=
2
||
callType
!=
4
)))
{
{
ocl_matchL1_gpu
<
unsigned
char
>
,
0
/*ocl_matchL1_gpu<signed char>*/
,
ocl_matchL1_gpu
<
unsigned
short
>
,
ocl_matchL1_gpu
<
short
>
,
ocl_matchL1_gpu
<
int
>
,
ocl_matchL1_gpu
<
float
>
},
{
0
/*ocl_matchL2_gpu<unsigned char>*/
,
0
/*ocl_matchL2_gpu<signed char>*/
,
0
/*ocl_matchL2_gpu<unsigned short>*/
,
0
/*ocl_matchL2_gpu<short>*/
,
0
/*ocl_matchL2_gpu<int>*/
,
ocl_matchL2_gpu
<
float
>
},
{
ocl_matchHamming_gpu
<
unsigned
char
>
,
0
/*ocl_matchHamming_gpu<signed char>*/
,
ocl_matchHamming_gpu
<
unsigned
short
>
,
0
/*ocl_matchHamming_gpu<short>*/
,
ocl_matchHamming_gpu
<
int
>
,
0
/*ocl_matchHamming_gpu<float>*/
}
};
//#endif
const
int
nQuery
=
query
.
rows
;
const
int
nTrain
=
train
.
rows
;
CV_ERROR
(
CV_UNSUPPORTED_DEPTH_ERR
,
"BruteForceMatch OpenCL only support float type query!
\n
"
);
}
CV_Assert
(
query
.
channels
()
==
1
&&
query
.
depth
()
<
CV_64F
);
CV_Assert
(
train
.
type
()
==
query
.
type
()
&&
train
.
cols
==
query
.
cols
);
CV_Assert
(
trainIdx
.
empty
()
||
(
trainIdx
.
rows
==
nQuery
&&
trainIdx
.
size
()
==
distance
.
size
()));
CV_Assert
(
trainIdx
.
empty
()
||
(
trainIdx
.
rows
==
query
.
rows
&&
trainIdx
.
size
()
==
distance
.
size
()));
nMatches
.
create
(
1
,
nQuery
,
CV_32SC1
);
nMatches
.
create
(
1
,
query
.
rows
,
CV_32SC1
);
if
(
trainIdx
.
empty
())
{
trainIdx
.
create
(
nQuery
,
std
::
max
((
nTrain
/
100
),
10
),
CV_32SC1
);
distance
.
create
(
nQuery
,
std
::
max
((
nTrain
/
100
),
10
),
CV_32FC1
);
trainIdx
.
create
(
query
.
rows
,
std
::
max
((
train
.
rows
/
100
),
10
),
CV_32SC1
);
distance
.
create
(
query
.
rows
,
std
::
max
((
train
.
rows
/
100
),
10
),
CV_32FC1
);
}
nMatches
.
setTo
(
Scalar
::
all
(
0
));
caller_t
func
=
callers
[
distType
][
query
.
depth
()];
//CV_Assert(func != 0);
//func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
func
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
);
matchDispatcher
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
distType
);
exit
:
return
;
}
void
cv
::
ocl
::
BruteForceMatcher_OCL_base
::
radiusMatchDownload
(
const
oclMat
&
trainIdx
,
const
oclMat
&
distance
,
const
oclMat
&
nMatches
,
...
...
@@ -1697,5 +1245,3 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto
radiusMatchCollection
(
query
,
trainIdx
,
imgIdx
,
distance
,
nMatches
,
maxDistance
,
masks
);
radiusMatchDownload
(
trainIdx
,
imgIdx
,
distance
,
nMatches
,
matches
,
compactResult
);
}
modules/ocl/src/haar.cpp
View file @
6c99b5c9
...
...
@@ -953,8 +953,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
//int flag = 0;
oclMat
gimg1
(
gimg
.
rows
,
gimg
.
cols
,
CV_8UC1
);
oclMat
gsum
(
totalheight
,
gimg
.
cols
+
1
,
CV_32SC1
);
oclMat
gsqsum
(
totalheight
,
gimg
.
cols
+
1
,
CV_32FC1
);
oclMat
gsum
(
totalheight
+
4
,
gimg
.
cols
+
1
,
CV_32SC1
);
oclMat
gsqsum
(
totalheight
+
4
,
gimg
.
cols
+
1
,
CV_32FC1
);
//cl_mem cascadebuffer;
cl_mem
stagebuffer
;
...
...
modules/ocl/src/match_template.cpp
View file @
6c99b5c9
...
...
@@ -71,6 +71,9 @@ namespace cv
void
matchTemplate_SQDIFF_NORMED
(
const
oclMat
&
image
,
const
oclMat
&
templ
,
oclMat
&
result
,
MatchTemplateBuf
&
buf
);
void
convolve_32F
(
const
oclMat
&
image
,
const
oclMat
&
templ
,
oclMat
&
result
,
MatchTemplateBuf
&
buf
);
void
matchTemplate_CCORR
(
const
oclMat
&
image
,
const
oclMat
&
templ
,
oclMat
&
result
,
MatchTemplateBuf
&
buf
);
...
...
@@ -90,41 +93,65 @@ namespace cv
void
matchTemplateNaive_CCORR
(
const
oclMat
&
image
,
const
oclMat
&
templ
,
oclMat
&
result
,
int
cn
);
void
extractFirstChannel_32F
(
const
oclMat
&
image
,
oclMat
&
result
);
// Evaluates optimal template's area threshold. If
// template's area is less than the threshold, we use naive match
// template version, otherwise FFT-based (if available)
static
int
getTemplateThreshold
(
int
method
,
int
depth
)
static
bool
useNaive
(
int
,
int
,
Size
)
{
switch
(
method
)
{
case
CV_TM_CCORR
:
if
(
depth
==
CV_32F
)
return
250
;
if
(
depth
==
CV_8U
)
return
300
;
break
;
case
CV_TM_SQDIFF
:
if
(
depth
==
CV_32F
)
return
0x7fffffff
;
// do naive SQDIFF for CV_32F
if
(
depth
==
CV_8U
)
return
300
;
break
;
}
CV_Error
(
CV_StsBadArg
,
"getTemplateThreshold: unsupported match template mode"
);
return
0
;
// FIXME!
// always use naive until convolve is imported
return
true
;
}
//////////////////////////////////////////////////////////////////////
// SQDIFF
void
matchTemplate_SQDIFF
(
const
oclMat
&
image
,
const
oclMat
&
templ
,
oclMat
&
result
,
MatchTemplateBuf
&
)
const
oclMat
&
image
,
const
oclMat
&
templ
,
oclMat
&
result
,
MatchTemplateBuf
&
buf
)
{
result
.
create
(
image
.
rows
-
templ
.
rows
+
1
,
image
.
cols
-
templ
.
cols
+
1
,
CV_32F
);
if
(
templ
.
size
().
area
()
<
getTemplateThreshold
(
CV_TM_SQDIFF
,
image
.
depth
()))
if
(
useNaive
(
CV_TM_SQDIFF
,
image
.
depth
(),
templ
.
size
()))
{
matchTemplateNaive_SQDIFF
(
image
,
templ
,
result
,
image
.
oclchannels
());
return
;
}
else
{
// TODO
CV_Error
(
CV_StsBadArg
,
"Not supported yet for this size template"
);
buf
.
image_sqsums
.
resize
(
1
);
// TODO, add double support for ocl::integral
// use CPU integral temporarily
Mat
sums
,
sqsums
;
cv
::
integral
(
Mat
(
image
.
reshape
(
1
)),
sums
,
sqsums
);
buf
.
image_sqsums
[
0
]
=
sqsums
;
unsigned
long
long
templ_sqsum
=
(
unsigned
long
long
)
sqrSum
(
templ
.
reshape
(
1
))[
0
];
matchTemplate_CCORR
(
image
,
templ
,
result
,
buf
);
//port CUDA's matchTemplatePrepared_SQDIFF_8U
Context
*
clCxt
=
image
.
clCxt
;
string
kernelName
=
"matchTemplate_Prepared_SQDIFF"
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
buf
.
image_sqsums
[
0
].
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
result
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_ulong
),
(
void
*
)
&
templ_sqsum
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
templ
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
templ
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
buf
.
image_sqsums
[
0
].
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
buf
.
image_sqsums
[
0
].
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
step
));
size_t
globalThreads
[
3
]
=
{
result
.
cols
,
result
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
const
char
*
build_opt
=
image
.
oclchannels
()
==
4
?
"-D CN4"
:
""
;
openCLExecuteKernel
(
clCxt
,
&
match_template
,
kernelName
,
globalThreads
,
localThreads
,
args
,
1
,
CV_8U
,
build_opt
);
}
}
...
...
@@ -134,7 +161,6 @@ namespace cv
matchTemplate_CCORR
(
image
,
templ
,
result
,
buf
);
buf
.
image_sums
.
resize
(
1
);
integral
(
image
.
reshape
(
1
),
buf
.
image_sums
[
0
]);
unsigned
long
long
templ_sqsum
=
(
unsigned
long
long
)
sqrSum
(
templ
.
reshape
(
1
))[
0
];
...
...
@@ -156,7 +182,7 @@ namespace cv
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
step
));
size_t
globalThreads
[
3
]
=
{
result
.
cols
,
result
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
32
,
8
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
openCLExecuteKernel
(
clCxt
,
&
match_template
,
kernelName
,
globalThreads
,
localThreads
,
args
,
1
,
CV_8U
);
}
...
...
@@ -191,33 +217,39 @@ namespace cv
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
step
));
size_t
globalThreads
[
3
]
=
{
result
.
cols
,
result
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
32
,
8
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
openCLExecuteKernel
(
clCxt
,
&
match_template
,
kernelName
,
globalThreads
,
localThreads
,
args
,
image
.
oclchannels
(),
image
.
depth
());
}
//////////////////////////////////////////////////////////////////////
// CCORR
void
convolve_32F
(
const
oclMat
&
,
const
oclMat
&
,
oclMat
&
,
MatchTemplateBuf
&
)
{
CV_Error
(
-
1
,
"convolve is not fully implemented yet"
);
}
void
matchTemplate_CCORR
(
const
oclMat
&
image
,
const
oclMat
&
templ
,
oclMat
&
result
,
MatchTemplateBuf
&
buf
)
{
result
.
create
(
image
.
rows
-
templ
.
rows
+
1
,
image
.
cols
-
templ
.
cols
+
1
,
CV_32F
);
if
(
templ
.
size
().
area
()
<
getTemplateThreshold
(
CV_TM_SQDIFF
,
image
.
depth
()))
if
(
useNaive
(
CV_TM_CCORR
,
image
.
depth
(),
templ
.
size
()))
{
matchTemplateNaive_CCORR
(
image
,
templ
,
result
,
image
.
oclchannels
());
return
;
}
else
{
CV_Error
(
CV_StsBadArg
,
"Not supported yet for this size template"
);
if
(
image
.
depth
()
==
CV_8U
&&
templ
.
depth
()
==
CV_8U
)
{
image
.
convertTo
(
buf
.
imagef
,
CV_32F
);
templ
.
convertTo
(
buf
.
templf
,
CV_32F
);
convolve_32F
(
buf
.
imagef
,
buf
.
templf
,
result
,
buf
);
}
else
{
convolve_32F
(
image
,
templ
,
result
,
buf
);
}
CV_Assert
(
image
.
oclchannels
()
==
1
);
oclMat
o_result
(
image
.
size
(),
CV_MAKETYPE
(
CV_32F
,
image
.
oclchannels
()));
filter2D
(
buf
.
imagef
,
o_result
,
CV_32F
,
buf
.
templf
,
Point
(
0
,
0
));
result
=
o_result
(
Rect
(
0
,
0
,
image
.
rows
-
templ
.
rows
+
1
,
image
.
cols
-
templ
.
cols
+
1
));
}
}
...
...
@@ -249,7 +281,7 @@ namespace cv
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
step
));
size_t
globalThreads
[
3
]
=
{
result
.
cols
,
result
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
32
,
8
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
openCLExecuteKernel
(
clCxt
,
&
match_template
,
kernelName
,
globalThreads
,
localThreads
,
args
,
1
,
CV_8U
);
}
...
...
@@ -284,7 +316,7 @@ namespace cv
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
step
));
size_t
globalThreads
[
3
]
=
{
result
.
cols
,
result
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
32
,
8
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
openCLExecuteKernel
(
clCxt
,
&
match_template
,
kernelName
,
globalThreads
,
localThreads
,
args
,
image
.
oclchannels
(),
image
.
depth
());
}
//////////////////////////////////////////////////////////////////////
...
...
@@ -301,7 +333,7 @@ namespace cv
kernelName
=
"matchTemplate_Prepared_CCOFF"
;
size_t
globalThreads
[
3
]
=
{
result
.
cols
,
result
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
32
,
8
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
result
.
data
)
);
...
...
@@ -313,22 +345,22 @@ namespace cv
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
cols
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
step
));
Vec4f
templ_sum
=
Vec4f
::
all
(
0
);
// to be continued in the following section
if
(
image
.
oclchannels
()
==
1
)
{
buf
.
image_sums
.
resize
(
1
);
integral
(
image
,
buf
.
image_sums
[
0
]);
float
templ_sum
=
0
;
templ_sum
=
(
float
)
sum
(
templ
)[
0
]
/
templ
.
size
().
area
();
templ_sum
[
0
]
=
(
float
)
sum
(
templ
)[
0
]
/
templ
.
size
().
area
();
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
buf
.
image_sums
[
0
].
data
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
buf
.
image_sums
[
0
].
offset
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
buf
.
image_sums
[
0
].
step
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
templ_sum
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
templ_sum
[
0
]
)
);
}
else
{
Vec4f
templ_sum
=
Vec4f
::
all
(
0
);
split
(
image
,
buf
.
images
);
templ_sum
=
sum
(
templ
)
/
templ
.
size
().
area
();
buf
.
image_sums
.
resize
(
buf
.
images
.
size
());
...
...
@@ -374,7 +406,7 @@ namespace cv
kernelName
=
"matchTemplate_Prepared_CCOFF_NORMED"
;
size_t
globalThreads
[
3
]
=
{
result
.
cols
,
result
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
32
,
8
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
result
.
data
)
);
...
...
@@ -387,20 +419,22 @@ namespace cv
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
scale
)
);
Vec4f
templ_sum
=
Vec4f
::
all
(
0
);
Vec4f
templ_sqsum
=
Vec4f
::
all
(
0
);
// to be continued in the following section
if
(
image
.
oclchannels
()
==
1
)
{
buf
.
image_sums
.
resize
(
1
);
buf
.
image_sqsums
.
resize
(
1
);
integral
(
image
,
buf
.
image_sums
[
0
],
buf
.
image_sqsums
[
0
]);
float
templ_sum
=
0
;
float
templ_sqsum
=
0
;
templ_sum
=
(
float
)
sum
(
templ
)[
0
];
templ_s
qsum
=
sqrS
um
(
templ
)[
0
];
templ_s
um
[
0
]
=
(
float
)
s
um
(
templ
)[
0
];
templ_sqsum
-=
scale
*
templ_sum
*
templ_sum
;
templ_sum
*=
scale
;
templ_sqsum
[
0
]
=
sqrSum
(
templ
)[
0
];
templ_sqsum
[
0
]
-=
scale
*
templ_sum
[
0
]
*
templ_sum
[
0
];
templ_sum
[
0
]
*=
scale
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
buf
.
image_sums
[
0
].
data
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
buf
.
image_sums
[
0
].
offset
)
);
...
...
@@ -408,13 +442,11 @@ namespace cv
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
buf
.
image_sqsums
[
0
].
data
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
buf
.
image_sqsums
[
0
].
offset
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
buf
.
image_sqsums
[
0
].
step
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
templ_sum
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
templ_sqsum
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
templ_sum
[
0
]
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
),
(
void
*
)
&
templ_sqsum
[
0
]
)
);
}
else
{
Vec4f
templ_sum
=
Vec4f
::
all
(
0
);
Vec4f
templ_sqsum
=
Vec4f
::
all
(
0
);
split
(
image
,
buf
.
images
);
templ_sum
=
sum
(
templ
);
...
...
@@ -465,7 +497,27 @@ namespace cv
}
openCLExecuteKernel
(
clCxt
,
&
match_template
,
kernelName
,
globalThreads
,
localThreads
,
args
,
image
.
oclchannels
(),
image
.
depth
());
}
void
extractFirstChannel_32F
(
const
oclMat
&
image
,
oclMat
&
result
)
{
Context
*
clCxt
=
image
.
clCxt
;
string
kernelName
;
kernelName
=
"extractFirstChannel"
;
size_t
globalThreads
[
3
]
=
{
result
.
cols
,
result
.
rows
,
1
};
size_t
localThreads
[
3
]
=
{
16
,
16
,
1
};
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
image
.
data
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
),
(
void
*
)
&
result
.
data
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
rows
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
cols
)
);
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
image
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
offset
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
image
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
),
(
void
*
)
&
result
.
step
));
openCLExecuteKernel
(
clCxt
,
&
match_template
,
kernelName
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
}
}
/*ocl*/
}
/*cv*/
...
...
modules/ocl/src/moments.cpp
View file @
6c99b5c9
...
...
@@ -143,7 +143,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_step
));
openCLExecuteKernel
(
dst_a
.
clCxt
,
&
moments
,
"icvContourMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
cv
::
Mat
dst
(
dst_a
);
a00
=
a10
=
a01
=
a20
=
a11
=
a02
=
a30
=
a21
=
a12
=
a03
=
0.0
;
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
supportsFeature
(
Context
::
CL_DOUBLE
))
...
...
@@ -277,16 +277,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
blocky
=
size
.
height
/
TILE_SIZE
;
else
blocky
=
size
.
height
/
TILE_SIZE
+
1
;
cv
::
ocl
::
oclMat
dst_m00
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m10
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m01
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m20
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m11
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m02
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m30
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m21
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m12
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m03
(
blocky
,
blockx
,
CV_64FC1
);
cv
::
ocl
::
oclMat
dst_m
(
blocky
*
10
,
blockx
,
CV_64FC1
);
cl_mem
sum
=
openCLCreateBuffer
(
src
.
clCxt
,
CL_MEM_READ_WRITE
,
10
*
sizeof
(
double
));
int
tile_width
=
std
::
min
(
size
.
width
,
TILE_SIZE
);
int
tile_height
=
std
::
min
(
size
.
height
,
TILE_SIZE
);
...
...
@@ -299,25 +290,17 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
src
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tileSize
.
width
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tileSize
.
height
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m00
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m10
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m01
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m20
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m11
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m02
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m30
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m21
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m12
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m03
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m00
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m00
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
blocky
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
type
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
depth
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
cn
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
coi
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
binary
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
TILE_SIZE
));
openCLExecuteKernel
(
dst_m
00
.
clCxt
,
&
moments
,
"CvMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
);
openCLExecuteKernel
(
dst_m
.
clCxt
,
&
moments
,
"CvMoments"
,
globalThreads
,
localThreads
,
args
,
-
1
,
depth
);
size_t
localThreadss
[
3
]
=
{
128
,
1
,
1
};
size_t
globalThreadss
[
3
]
=
{
128
,
1
,
1
};
...
...
@@ -327,20 +310,12 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
tile_width
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
TILE_SIZE
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
sum
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m00
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m10
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m01
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m20
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m11
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m02
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m30
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m21
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m12
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m03
.
data
));
openCLExecuteKernel
(
dst_m00
.
clCxt
,
&
moments
,
"dst_sum"
,
globalThreadss
,
localThreadss
,
args_sum
,
-
1
,
-
1
);
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
dst_m
.
data
));
args_sum
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
dst_m
.
step
));
openCLExecuteKernel
(
dst_m
.
clCxt
,
&
moments
,
"dst_sum"
,
globalThreadss
,
localThreadss
,
args_sum
,
-
1
,
-
1
);
double
*
dstsum
=
new
double
[
10
];
memset
(
dstsum
,
0
,
10
*
sizeof
(
double
));
openCLReadBuffer
(
dst_m
00
.
clCxt
,
sum
,(
void
*
)
dstsum
,
10
*
sizeof
(
double
));
openCLReadBuffer
(
dst_m
.
clCxt
,
sum
,(
void
*
)
dstsum
,
10
*
sizeof
(
double
));
mom
->
m00
=
dstsum
[
0
];
mom
->
m10
=
dstsum
[
1
];
mom
->
m01
=
dstsum
[
2
];
...
...
@@ -351,6 +326,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
mom
->
m21
=
dstsum
[
7
];
mom
->
m12
=
dstsum
[
8
];
mom
->
m03
=
dstsum
[
9
];
delete
[]
dstsum
;
icvCompleteMomentState
(
mom
);
}
...
...
modules/ocl/src/opencl/brute_force_match.cl
View file @
6c99b5c9
...
...
@@ -5,19 +5,93 @@ int bit1Count(float x)
{
int
c
=
0
;
int
ix
=
(
int
)
x
;
for
(
int
i
=
0
; i < 32 ; i++)
{
c
+=
ix
&
0x1
;
ix
>>=
1
;
}
return
(
float
)
c
;
}
float
reduce_block
(
__local
float
*s_query,
__local
float
*s_train,
int
block_size,
int
lidx,
int
lidy,
int
distType
)
{
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
float
result
=
0
;
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[
(
uint
)
j
*
block_size
+
lidx]
)
;
}
break
;
}
return
result
;
}
float
reduce_multi_block
(
__local
float
*s_query,
__local
float
*s_train,
int
max_desc_len,
int
block_size,
int
block_index,
int
lidx,
int
lidy,
int
distType
)
{
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
float
result
=
0
;
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
max_desc_len
+
block_index
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
max_desc_len
+
block_index
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
max_desc_len
+
block_index
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
max_desc_len
+
block_index
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
return
result
;
}
/*
2dim
launch,
global
size:
dim0
is
(
query
rows
+
block_size
-
1
)
/
block_size
*
block_size,
dim1
is
block_size
local
size:
dim0
is
block_size,
dim1
is
block_size.
*/
__kernel
void
BruteForceMatch_UnrollMatch
(
__kernel
void
BruteForceMatch_UnrollMatch
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -42,7 +116,6 @@ __kernel void BruteForceMatch_UnrollMatch(
__local
float
*s_train
=
sharebuffer
+
block_size
*
max_desc_len
;
int
queryIdx
=
groupidx
*
block_size
+
lidy
;
//
load
the
query
into
local
memory.
for
(
int
i
=
0
; i < max_desc_len / block_size; i ++)
{
...
...
@@ -55,11 +128,9 @@ __kernel void BruteForceMatch_UnrollMatch(
//
loopUnrolledCached
to
find
the
best
trainIdx
and
best
distance.
volatile
int
imgIdx
=
0
;
for
(
int
t
=
0
; t < (train_rows + block_size - 1) / block_size ; t++)
{
float
result
=
0
;
for
(
int
i
=
0
; i < max_desc_len / block_size ; i++)
{
//load
a
block_size
*
block_size
block
into
local
train.
...
...
@@ -69,38 +140,7 @@ __kernel void BruteForceMatch_UnrollMatch(
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_multi_block
(
s_query,
s_train,
max_desc_len,
block_size,
i,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -116,8 +156,8 @@ __kernel void BruteForceMatch_UnrollMatch(
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__local
float
*s_distance
=
(
__local
float
*
)(
sharebuffer
)
;
__local
int
*
s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
block_size
*
block_size
)
;
__local
float
*s_distance
=
(
__local
float*
)(
sharebuffer
)
;
__local
int
*
s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
block_size
*
block_size
)
;
//find
BestMatch
s_distance
+=
lidy
*
block_size
;
...
...
@@ -144,7 +184,7 @@ __kernel void BruteForceMatch_UnrollMatch(
}
}
__kernel
void
BruteForceMatch_Match
(
__kernel
void
BruteForceMatch_Match
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -177,7 +217,6 @@ __kernel void BruteForceMatch_Match(
{
//Dist
dist
;
float
result
=
0
;
for
(
int
i
=
0
; i < (query_cols + block_size - 1) / block_size ; i++)
{
const
int
loadx
=
lidx
+
i
*
block_size
;
...
...
@@ -193,38 +232,7 @@ __kernel void BruteForceMatch_Match(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[
(
uint
)
j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_block
(
s_query,
s_train,
block_size,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -270,7 +278,7 @@ __kernel void BruteForceMatch_Match(
}
//radius_unrollmatch
__kernel
void
BruteForceMatch_RadiusUnrollMatch
(
__kernel
void
BruteForceMatch_RadiusUnrollMatch
_D5
(
__global
float
*query,
__global
float
*train,
float
maxDistance,
...
...
@@ -303,7 +311,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
__local
float
*s_train
=
sharebuffer
+
block_size
*
block_size
;
float
result
=
0
;
for
(
int
i
=
0
; i < max_desc_len / block_size ; ++i)
{
//load
a
block_size
*
block_size
block
into
local
train.
...
...
@@ -315,37 +322,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
three
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; ++j)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; ++j)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; ++j)
{
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_block
(
s_query,
s_train,
block_size,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -354,7 +331,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
{
unsigned
int
ind
=
atom_inc
(
nMatches
+
queryIdx/*,
(
unsigned
int
)
-1*/
)
;
if
(
ind
<
bestTrainIdx_cols
)
if
(
ind
<
bestTrainIdx_cols
)
{
//bestImgIdx
=
imgIdx
;
bestTrainIdx[queryIdx
*
(
ostep
/
sizeof
(
int
))
+
ind]
=
trainIdx
;
...
...
@@ -364,7 +341,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
}
//radius_match
__kernel
void
BruteForceMatch_RadiusMatch
(
__kernel
void
BruteForceMatch_RadiusMatch
_D5
(
__global
float
*query,
__global
float
*train,
float
maxDistance,
...
...
@@ -396,7 +373,6 @@ __kernel void BruteForceMatch_RadiusMatch(
__local
float
*s_train
=
sharebuffer
+
block_size
*
block_size
;
float
result
=
0
;
for
(
int
i
=
0
; i < (query_cols + block_size - 1) / block_size ; ++i)
{
//load
a
block_size
*
block_size
block
into
local
train.
...
...
@@ -408,46 +384,16 @@ __kernel void BruteForceMatch_RadiusMatch(
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
three
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; ++j)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; ++j)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; ++j)
{
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_block
(
s_query,
s_train,
block_size,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
&&
result
<
maxDistance/*
&&
mask
(
queryIdx,
trainIdx
)
*/
)
{
unsigned
int
ind
=
atom_inc
(
nMatches
+
queryIdx
/*,
(
unsigned
int
)
-1*/
)
;
unsigned
int
ind
=
atom_inc
(
nMatches
+
queryIdx
)
;
if
(
ind
<
bestTrainIdx_cols
)
if
(
ind
<
bestTrainIdx_cols
)
{
//bestImgIdx
=
imgIdx
;
bestTrainIdx[queryIdx
*
(
ostep
/
sizeof
(
int
))
+
ind]
=
trainIdx
;
...
...
@@ -457,7 +403,7 @@ __kernel void BruteForceMatch_RadiusMatch(
}
__kernel
void
BruteForceMatch_knnUnrollMatch
(
__kernel
void
BruteForceMatch_knnUnrollMatch
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -496,11 +442,9 @@ __kernel void BruteForceMatch_knnUnrollMatch(
//loopUnrolledCached
volatile
int
imgIdx
=
0
;
for
(
int
t
=
0
; t < (train_rows + block_size - 1) / block_size ; t++)
{
float
result
=
0
;
for
(
int
i
=
0
; i < max_desc_len / block_size ; i++)
{
const
int
loadX
=
lidx
+
i
*
block_size
;
...
...
@@ -511,38 +455,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_multi_block
(
s_query,
s_train,
max_desc_len,
block_size,
i,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -589,7 +502,6 @@ __kernel void BruteForceMatch_knnUnrollMatch(
for
(
int
i
=
0
; i < block_size ; i++)
{
float
val
=
s_distance[i]
;
if
(
val
<
bestDistance1
)
{
bestDistance2
=
bestDistance1
;
...
...
@@ -640,7 +552,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
}
}
__kernel
void
BruteForceMatch_knnMatch
(
__kernel
void
BruteForceMatch_knnMatch
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -673,8 +585,7 @@ __kernel void BruteForceMatch_knnMatch(
for
(
int
t
=
0
; t < (train_rows + block_size - 1) / block_size ; t++)
{
float
result
=
0.0f
;
for
(
int
i
=
0
; i < (query_cols + block_size - 1) / block_size ; i++)
for
(
int
i
=
0
; i < (query_cols + block_size -1) / block_size ; i++)
{
const
int
loadx
=
lidx
+
i
*
block_size
;
//load
query
and
train
into
local
memory
...
...
@@ -689,38 +600,7 @@ __kernel void BruteForceMatch_knnMatch(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[
(
uint
)
j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_block
(
s_query,
s_train,
block_size,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -767,7 +647,6 @@ __kernel void BruteForceMatch_knnMatch(
for
(
int
i
=
0
; i < block_size ; i++)
{
float
val
=
s_distance[i]
;
if
(
val
<
bestDistance1
)
{
bestDistance2
=
bestDistance1
;
...
...
@@ -818,7 +697,7 @@ __kernel void BruteForceMatch_knnMatch(
}
}
kernel
void
BruteForceMatch_calcDistanceUnrolled
(
kernel
void
BruteForceMatch_calcDistanceUnrolled
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -836,7 +715,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
/*
Todo
*/
}
kernel
void
BruteForceMatch_calcDistance
(
kernel
void
BruteForceMatch_calcDistance
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -853,7 +732,7 @@ kernel void BruteForceMatch_calcDistance(
/*
Todo
*/
}
kernel
void
BruteForceMatch_findBestMatch
(
kernel
void
BruteForceMatch_findBestMatch
_D5
(
__global
float
*allDist,
__global
int
*bestTrainIdx,
__global
float
*bestDistance,
...
...
modules/ocl/src/opencl/haarobjectdetect.cl
View file @
6c99b5c9
...
...
@@ -211,10 +211,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int4
data
=
*
(
__global
int4*
)
&sum[glb_off]
;
int
lcl_off
=
mad24
(
lcl_y,
readwidth,
lcl_x<<2
)
;
#
if
OFF
lcldata[lcl_off]
=
data.x
;
lcldata[lcl_off+1]
=
data.y
;
lcldata[lcl_off+2]
=
data.z
;
lcldata[lcl_off+3]
=
data.w
;
#
else
vstore4
(
data,
0
,
&lcldata[lcl_off]
)
;
#
endif
}
lcloutindex[lcl_id]
=
0
;
...
...
@@ -559,3 +563,7 @@ if(result)
}
}
*/
modules/ocl/src/opencl/match_template.cl
View file @
6c99b5c9
...
...
@@ -45,22 +45,28 @@
#
pragma
OPENCL
EXTENSION
cl_amd_printf
:
enable
#
if
defined
(
__ATI__
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
if
defined
(
DOUBLE_SUPPORT
)
#
elif
defined
(
__NVIDIA__
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
if
!defined
(
USE_SQR_INTEGRAL
)
&&
(
defined
(
__ATI__
)
||
defined
(
__NVIDIA__
))
#
define
TYPE_IMAGE_SQSUM
double
#
else
#
define
TYPE_IMAGE_SQSUM
ulong
#
define
TYPE_IMAGE_SQSUM
float
#
endif
#
ifndef
CN4
#
define
CN4
1
#
else
#
define
CN4
4
#
endif
//////////////////////////////////////////////////
//
utilities
#
define
SQSUMS_PTR
(
ox,
oy
)
mad24
(
gidy
+
oy,
img_sqsums_step,
gidx
+
img_sqsums_offset
+
ox
)
#
define
SQSUMS_PTR
(
ox,
oy
)
mad24
(
gidy
+
oy,
img_sqsums_step,
(
gidx
+
img_sqsums_offset
+
ox
)
*
CN4
)
#
define
SUMS_PTR
(
ox,
oy
)
mad24
(
gidy
+
oy,
img_sums_step,
gidx
+
img_sums_offset
+
ox
)
//
normAcc*
are
accurate
normalization
routines
which
make
GPU
matchTemplate
//
consistent
with
CPU
one
...
...
@@ -95,7 +101,7 @@ float normAcc_SQDIFF(float num, float denum)
__kernel
void
normalizeKernel_C1_D0
(
__global
const
TYPE_IMAGE_SQSUM
*
img_sqsums,
__global
const
float
*
img_sqsums,
__global
float
*
res,
ulong
tpl_sqsum,
int
res_rows,
...
...
@@ -119,8 +125,8 @@ void normalizeKernel_C1_D0
if
(
gidx
<
res_cols
&&
gidy
<
res_rows
)
{
float
image_sqsum_
=
(
float
)(
(
img_sqsums[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
0
,
0
)
]
))
;
(
img_sqsums[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
0
,
0
)
]
))
;
res[res_idx]
=
normAcc
(
res[res_idx],
sqrt
(
image_sqsum_
*
tpl_sqsum
))
;
}
}
...
...
@@ -152,8 +158,8 @@ void matchTemplate_Prepared_SQDIFF_C1_D0
if
(
gidx
<
res_cols
&&
gidy
<
res_rows
)
{
float
image_sqsum_
=
(
float
)(
(
img_sqsums[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
0
,
0
)
]
))
;
(
img_sqsums[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
0
,
0
)
]
))
;
res[res_idx]
=
image_sqsum_
-
2.f
*
res[res_idx]
+
tpl_sqsum
;
}
}
...
...
@@ -161,7 +167,7 @@ void matchTemplate_Prepared_SQDIFF_C1_D0
__kernel
void
matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
(
__global
const
TYPE_IMAGE_SQSUM
*
img_sqsums,
__global
const
float
*
img_sqsums,
__global
float
*
res,
ulong
tpl_sqsum,
int
res_rows,
...
...
@@ -185,10 +191,10 @@ void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
if
(
gidx
<
res_cols
&&
gidy
<
res_rows
)
{
float
image_sqsum_
=
(
float
)(
(
img_sqsums[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
0
,
0
)
]
))
;
(
img_sqsums[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
0
,
0
)
]
))
;
res[res_idx]
=
normAcc_SQDIFF
(
image_sqsum_
-
2.f
*
res[res_idx]
+
tpl_sqsum,
sqrt
(
image_sqsum_
*
tpl_sqsum
))
;
sqrt
(
image_sqsum_
*
tpl_sqsum
))
;
}
}
...
...
@@ -628,8 +634,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0
if
(
gidx
<
res_cols
&&
gidy
<
res_rows
)
{
float
sum
=
(
float
)(
(
img_sums[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums[SUMS_PTR
(
0
,
0
)
]
))
;
res[res_idx]
-=
sum
*
tpl_sum
;
}
}
...
...
@@ -671,17 +677,17 @@ void matchTemplate_Prepared_CCOFF_C4_D0
{
float
ccorr
=
res[res_idx]
;
ccorr
-=
tpl_sum_c0*
(
float
)(
(
img_sums_c0[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c0[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c0[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c0[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums_c0[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c0[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c0[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c0[SUMS_PTR
(
0
,
0
)
]
))
;
ccorr
-=
tpl_sum_c1*
(
float
)(
(
img_sums_c1[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c1[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c1[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c1[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums_c1[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c1[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c1[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c1[SUMS_PTR
(
0
,
0
)
]
))
;
ccorr
-=
tpl_sum_c2*
(
float
)(
(
img_sums_c2[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c2[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c2[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c2[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums_c2[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c2[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c2[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c2[SUMS_PTR
(
0
,
0
)
]
))
;
ccorr
-=
tpl_sum_c3*
(
float
)(
(
img_sums_c3[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c3[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c3[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c3[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums_c3[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c3[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c3[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c3[SUMS_PTR
(
0
,
0
)
]
))
;
res[res_idx]
=
ccorr
;
}
}
...
...
@@ -702,7 +708,7 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
__global
const
uint
*
img_sums,
int
img_sums_offset,
int
img_sums_step,
__global
const
TYPE_IMAGE_SQSUM
*
img_sqsums,
__global
const
float
*
img_sqsums,
int
img_sqsums_offset,
int
img_sqsums_step,
float
tpl_sum,
...
...
@@ -725,12 +731,12 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
if
(
gidx
<
res_cols
&&
gidy
<
res_rows
)
{
float
image_sum_
=
(
float
)(
(
img_sums[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums[SUMS_PTR
(
0
,
0
)
]
))
;
float
image_sqsum_
=
(
float
)(
(
img_sqsums[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
0
,
0
)
]
))
;
(
img_sqsums[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums[SQSUMS_PTR
(
0
,
0
)
]
))
;
res[res_idx]
=
normAcc
(
res[res_idx]
-
image_sum_
*
tpl_sum,
sqrt
(
tpl_sqsum
*
(
image_sqsum_
-
weight
*
image_sum_
*
image_sum_
)))
;
}
...
...
@@ -754,10 +760,10 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
__global
const
uint
*
img_sums_c3,
int
img_sums_offset,
int
img_sums_step,
__global
const
TYPE_IMAGE_SQSUM
*
img_sqsums_c0,
__global
const
TYPE_IMAGE_SQSUM
*
img_sqsums_c1,
__global
const
TYPE_IMAGE_SQSUM
*
img_sqsums_c2,
__global
const
TYPE_IMAGE_SQSUM
*
img_sqsums_c3,
__global
const
float
*
img_sqsums_c0,
__global
const
float
*
img_sqsums_c1,
__global
const
float
*
img_sqsums_c2,
__global
const
float
*
img_sqsums_c3,
int
img_sqsums_offset,
int
img_sqsums_step,
float
tpl_sum_c0,
...
...
@@ -782,42 +788,71 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
if
(
gidx
<
res_cols
&&
gidy
<
res_rows
)
{
float
image_sum_c0
=
(
float
)(
(
img_sums_c0[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c0[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c0[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c0[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums_c0[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c0[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c0[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c0[SUMS_PTR
(
0
,
0
)
]
))
;
float
image_sum_c1
=
(
float
)(
(
img_sums_c1[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c1[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c1[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c1[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums_c1[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c1[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c1[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c1[SUMS_PTR
(
0
,
0
)
]
))
;
float
image_sum_c2
=
(
float
)(
(
img_sums_c2[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c2[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c2[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c2[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums_c2[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c2[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c2[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c2[SUMS_PTR
(
0
,
0
)
]
))
;
float
image_sum_c3
=
(
float
)(
(
img_sums_c3[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c3[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c3[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c3[SUMS_PTR
(
0
,
0
)
]
))
;
(
img_sums_c3[SUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sums_c3[SUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sums_c3[SUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sums_c3[SUMS_PTR
(
0
,
0
)
]
))
;
float
image_sqsum_c0
=
(
float
)(
(
img_sqsums_c0[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums_c0[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums_c0[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums_c0[SQSUMS_PTR
(
0
,
0
)
]
))
;
(
img_sqsums_c0[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums_c0[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums_c0[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums_c0[SQSUMS_PTR
(
0
,
0
)
]
))
;
float
image_sqsum_c1
=
(
float
)(
(
img_sqsums_c1[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums_c1[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums_c1[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums_c1[SQSUMS_PTR
(
0
,
0
)
]
))
;
(
img_sqsums_c1[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums_c1[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums_c1[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums_c1[SQSUMS_PTR
(
0
,
0
)
]
))
;
float
image_sqsum_c2
=
(
float
)(
(
img_sqsums_c2[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums_c2[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums_c2[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums_c2[SQSUMS_PTR
(
0
,
0
)
]
))
;
(
img_sqsums_c2[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums_c2[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums_c2[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums_c2[SQSUMS_PTR
(
0
,
0
)
]
))
;
float
image_sqsum_c3
=
(
float
)(
(
img_sqsums_c3[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums_c3[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums_c3[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums_c3[SQSUMS_PTR
(
0
,
0
)
]
))
;
(
img_sqsums_c3[SQSUMS_PTR
(
tpl_cols,
tpl_rows
)
]
-
img_sqsums_c3[SQSUMS_PTR
(
tpl_cols,
0
)
]
)
-
(
img_sqsums_c3[SQSUMS_PTR
(
0
,
tpl_rows
)
]
-
img_sqsums_c3[SQSUMS_PTR
(
0
,
0
)
]
))
;
float
num
=
res[res_idx]
-
image_sum_c0
*
tpl_sum_c0
-
image_sum_c1
*
tpl_sum_c1
-
image_sum_c2
*
tpl_sum_c2
-
image_sum_c3
*
tpl_sum_c3
;
image_sum_c0
*
tpl_sum_c0
-
image_sum_c1
*
tpl_sum_c1
-
image_sum_c2
*
tpl_sum_c2
-
image_sum_c3
*
tpl_sum_c3
;
float
denum
=
sqrt
(
tpl_sqsum
*
(
image_sqsum_c0
-
weight
*
image_sum_c0
*
image_sum_c0
+
image_sqsum_c1
-
weight
*
image_sum_c1
*
image_sum_c1
+
image_sqsum_c2
-
weight
*
image_sum_c2
*
image_sum_c2
+
image_sqsum_c3
-
weight
*
image_sum_c0
*
image_sum_c3
)
)
;
image_sqsum_c0
-
weight
*
image_sum_c0
*
image_sum_c0
+
image_sqsum_c1
-
weight
*
image_sum_c1
*
image_sum_c1
+
image_sqsum_c2
-
weight
*
image_sum_c2
*
image_sum_c2
+
image_sqsum_c3
-
weight
*
image_sum_c0
*
image_sum_c3
)
)
;
res[res_idx]
=
normAcc
(
num,
denum
)
;
}
}
//////////////////////////////////////////////////////////////////////
//
extractFirstChannel
__kernel
void
extractFirstChannel
(
const
__global
float4*
img,
__global
float*
res,
int
rows,
int
cols,
int
img_offset,
int
res_offset,
int
img_step,
int
res_step
)
{
img_step
/=
sizeof
(
float4
)
;
res_step
/=
sizeof
(
float
)
;
img_offset
/=
sizeof
(
float4
)
;
res_offset
/=
sizeof
(
float
)
;
img
+=
img_offset
;
res
+=
res_offset
;
int
gidx
=
get_global_id
(
0
)
;
int
gidy
=
get_global_id
(
1
)
;
if
(
gidx
<
cols
&&
gidy
<
rows
)
{
res[gidx
+
gidy
*
res_step]
=
img[gidx
+
gidy
*
img_step].x
;
}
}
modules/ocl/src/opencl/moments.cl
View file @
6c99b5c9
...
...
@@ -6,25 +6,27 @@
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
typedef
double
T
;
typedef
double
F
;
typedef
double4
F4
;
#
define
convert_F4
convert_double4
#
else
typedef
float
double
;
typedef
float4
double
4
;
typedef
float
F
;
typedef
float4
F
4
;
typedef
long
T
;
#
define
convert_
double
4
convert_float4
#
define
convert_
F
4
convert_float4
#
endif
//#pragma
OPENCL
EXTENSION
cl_amd_printf:enable
//#if
defined
(
DOUBLE_SUPPORT
)
#
define
DST_ROW_A00
0
#
define
DST_ROW_A10
1
#
define
DST_ROW_A01
2
#
define
DST_ROW_A20
3
#
define
DST_ROW_A11
4
#
define
DST_ROW_A02
5
#
define
DST_ROW_A30
6
#
define
DST_ROW_A21
7
#
define
DST_ROW_A12
8
#
define
DST_ROW_A03
9
#
define
DST_ROW_00
0
#
define
DST_ROW_10
1
#
define
DST_ROW_01
2
#
define
DST_ROW_20
3
#
define
DST_ROW_11
4
#
define
DST_ROW_02
5
#
define
DST_ROW_30
6
#
define
DST_ROW_21
7
#
define
DST_ROW_12
8
#
define
DST_ROW_03
9
__kernel
void
icvContourMoments
(
int
contour_total,
__global
float*
reader_oclmat_data,
...
...
@@ -60,36 +62,76 @@ __kernel void icvContourMoments(int contour_total,
yii_1
=
yi_1
+
yi
;
dst_step
/=
sizeof
(
T
)
;
*
(
dst_a
+
DST_ROW_
A
00
*
dst_step
+
idx
)
=
dxy
;
*
(
dst_a
+
DST_ROW_
A
10
*
dst_step
+
idx
)
=
dxy
*
xii_1
;
*
(
dst_a
+
DST_ROW_
A
01
*
dst_step
+
idx
)
=
dxy
*
yii_1
;
*
(
dst_a
+
DST_ROW_
A
20
*
dst_step
+
idx
)
=
dxy
*
(
xi_1
*
xii_1
+
xi2
)
;
*
(
dst_a
+
DST_ROW_
A
11
*
dst_step
+
idx
)
=
dxy
*
(
xi_1
*
(
yii_1
+
yi_1
)
+
xi
*
(
yii_1
+
yi
))
;
*
(
dst_a
+
DST_ROW_
A
02
*
dst_step
+
idx
)
=
dxy
*
(
yi_1
*
yii_1
+
yi2
)
;
*
(
dst_a
+
DST_ROW_
A
30
*
dst_step
+
idx
)
=
dxy
*
xii_1
*
(
xi_12
+
xi2
)
;
*
(
dst_a
+
DST_ROW_
A
03
*
dst_step
+
idx
)
=
dxy
*
yii_1
*
(
yi_12
+
yi2
)
;
*
(
dst_a
+
DST_ROW_
A
21
*
dst_step
+
idx
)
=
*
(
dst_a
+
DST_ROW_00
*
dst_step
+
idx
)
=
dxy
;
*
(
dst_a
+
DST_ROW_10
*
dst_step
+
idx
)
=
dxy
*
xii_1
;
*
(
dst_a
+
DST_ROW_01
*
dst_step
+
idx
)
=
dxy
*
yii_1
;
*
(
dst_a
+
DST_ROW_20
*
dst_step
+
idx
)
=
dxy
*
(
xi_1
*
xii_1
+
xi2
)
;
*
(
dst_a
+
DST_ROW_11
*
dst_step
+
idx
)
=
dxy
*
(
xi_1
*
(
yii_1
+
yi_1
)
+
xi
*
(
yii_1
+
yi
))
;
*
(
dst_a
+
DST_ROW_02
*
dst_step
+
idx
)
=
dxy
*
(
yi_1
*
yii_1
+
yi2
)
;
*
(
dst_a
+
DST_ROW_30
*
dst_step
+
idx
)
=
dxy
*
xii_1
*
(
xi_12
+
xi2
)
;
*
(
dst_a
+
DST_ROW_03
*
dst_step
+
idx
)
=
dxy
*
yii_1
*
(
yi_12
+
yi2
)
;
*
(
dst_a
+
DST_ROW_21
*
dst_step
+
idx
)
=
dxy
*
(
xi_12
*
(
3
*
yi_1
+
yi
)
+
2
*
xi
*
xi_1
*
yii_1
+
xi2
*
(
yi_1
+
3
*
yi
))
;
*
(
dst_a
+
DST_ROW_
A
12
*
dst_step
+
idx
)
=
*
(
dst_a
+
DST_ROW_12
*
dst_step
+
idx
)
=
dxy
*
(
yi_12
*
(
3
*
xi_1
+
xi
)
+
2
*
yi
*
yi_1
*
xii_1
+
yi2
*
(
xi_1
+
3
*
xi
))
;
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
dst_sum
(
int
src_rows,
int
src_cols,
int
tile_height,
int
tile_width,
int
TILE_SIZE,
__global
F*
sum,
__global
F*
dst_m,
int
dst_step
)
{
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
block_y
=
src_rows/tile_height
;
int
block_x
=
src_cols/tile_width
;
int
block_num
;
if
(
src_rows
>
TILE_SIZE
&&
src_rows
%
TILE_SIZE
!=
0
)
block_y
++
;
if
(
src_cols
>
TILE_SIZE
&&
src_cols
%
TILE_SIZE
!=
0
)
block_x
++
;
block_num
=
block_y
*
block_x
;
__local
F
dst_sum[10][128]
;
if
(
gidy<128-block_num
)
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy+block_num]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
dst_step
/=
sizeof
(
F
)
;
if
(
gidy<block_num
)
{
dst_sum[0][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_00
*
block_y,
dst_step,
gidy
))
;
dst_sum[1][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_10
*
block_y,
dst_step,
gidy
))
;
dst_sum[2][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_01
*
block_y,
dst_step,
gidy
))
;
dst_sum[3][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_20
*
block_y,
dst_step,
gidy
))
;
dst_sum[4][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_11
*
block_y,
dst_step,
gidy
))
;
dst_sum[5][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_02
*
block_y,
dst_step,
gidy
))
;
dst_sum[6][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_30
*
block_y,
dst_step,
gidy
))
;
dst_sum[7][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_21
*
block_y,
dst_step,
gidy
))
;
dst_sum[8][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_12
*
block_y,
dst_step,
gidy
))
;
dst_sum[9][gidy]
=
*
(
dst_m
+
mad24
(
DST_ROW_03
*
block_y,
dst_step,
gidy
))
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
lsize=64
; lsize>0; lsize>>=1)
{
if
(
gidy<lsize
)
{
int
lsize2
=
gidy
+
lsize
;
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy]
+=
dst_sum[i][lsize2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
gidy==0
)
for
(
int
i=0
; i<10; i++)
sum[i]
=
dst_sum[i][0]
;
}
__kernel
void
CvMoments_D0
(
__global
uchar16*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
int
TILE_SIZE
)
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
int
TILE_SIZE
)
{
uchar
tmp_coi[16]
; // get the coi data
uchar16
tmp[16]
;
...
...
@@ -127,7 +169,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C]
=
(
tmp[i/VLEN_C]!=zero
)
?full:zero
;
double
mom[10]
;
F
mom[10]
;
__local
int
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i=0
; i<10; i++)
...
...
@@ -197,119 +239,53 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
dst_sum
(
int
src_rows,
int
src_cols,
int
tile_height,
int
tile_width,
int
TILE_SIZE,
__global
double*
sum,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03
)
{
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
int
block_y
=
src_rows/tile_height
;
int
block_x
=
src_cols/tile_width
;
int
block_num
;
if
(
src_rows
>
TILE_SIZE
&&
src_rows
%
TILE_SIZE
!=
0
)
block_y
++
;
if
(
src_cols
>
TILE_SIZE
&&
src_cols
%
TILE_SIZE
!=
0
)
block_x
++
;
block_num
=
block_y
*
block_x
;
__local
double
dst_sum[10][128]
;
if
(
gidy<128-block_num
)
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy+block_num]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
gidy<block_num
)
{
dst_sum[0][gidy]
=
dst_m00[gidy]
;
dst_sum[1][gidy]
=
dst_m10[gidy]
;
dst_sum[2][gidy]
=
dst_m01[gidy]
;
dst_sum[3][gidy]
=
dst_m20[gidy]
;
dst_sum[4][gidy]
=
dst_m11[gidy]
;
dst_sum[5][gidy]
=
dst_m02[gidy]
;
dst_sum[6][gidy]
=
dst_m30[gidy]
;
dst_sum[7][gidy]
=
dst_m21[gidy]
;
dst_sum[8][gidy]
=
dst_m12[gidy]
;
dst_sum[9][gidy]
=
dst_m03[gidy]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
lsize=64
; lsize>0; lsize>>=1)
{
if
(
gidy<lsize
)
{
int
lsize2
=
gidy
+
lsize
;
for
(
int
i=0
; i<10; i++)
dst_sum[i][gidy]
+=
dst_sum[i][lsize2]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
gidy==0
)
for
(
int
i=0
; i<10; i++)
sum[i]
=
dst_sum[i][0]
;
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
CvMoments_D2
(
__global
ushort8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
ushort
tmp_coi[8]
; // get the coi data
...
...
@@ -345,7 +321,7 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US]
=
(
tmp[i/VLEN_US]!=zero
)
?full:zero
;
double
mom[10]
;
F
mom[10]
;
__local
long
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i=0
; i<10; i++)
...
...
@@ -415,64 +391,55 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*mom[0],
ym
=
y
*
mom[0]
;
F
xm
=
x
*mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
CvMoments_D3
(
__global
short8*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
short
tmp_coi[8]
; // get the coi data
...
...
@@ -509,7 +476,7 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
for
(
int
i=0
; i < tileSize_width; i+=(VLEN_S))
tmp[i/VLEN_S]
=
(
tmp[i/VLEN_S]!=zero
)
?full:zero
;
double
mom[10]
;
F
mom[10]
;
__local
long
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i=0
; i<10; i++)
...
...
@@ -579,64 +546,55 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
if
(
lidy
==0
&&lidx
==0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*
mom[0],
ym
=
y*mom[0]
;
F
xm
=
x
*
mom[0],
ym
=
y*mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
CvMoments_D5
(
__global
float*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
float
tmp_coi[4]
; // get the coi data
...
...
@@ -672,23 +630,23 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=4)
tmp[i/VLEN_F]
=
(
tmp[i/VLEN_F]!=zero
)
?full:zero
;
double
mom[10]
;
__local
double
m[10][128]
;
F
mom[10]
;
__local
F
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i
=
0
; i < 10; i ++)
for
(
int
j
=
0
; j < 128; j ++)
m[i][j]
=
0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
double
lm[10]
=
{0}
;
double4
x0
=
(
double
4
)(
0
)
;
double4
x1
=
(
double
4
)(
0
)
;
double4
x2
=
(
double
4
)(
0
)
;
double4
x3
=
(
double
4
)(
0
)
;
F
lm[10]
=
{0}
;
F4
x0
=
(
F
4
)(
0
)
;
F4
x1
=
(
F
4
)(
0
)
;
F4
x2
=
(
F
4
)(
0
)
;
F4
x3
=
(
F
4
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+=VLEN_F )
{
double4
v_xt
=
(
double
4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
double4
p
=
convert_double
4
(
tmp[xt/VLEN_F]
)
;
double
4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
F4
v_xt
=
(
F
4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
F4
p
=
convert_F
4
(
tmp[xt/VLEN_F]
)
;
F
4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
...
...
@@ -698,14 +656,14 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
x1.s0
+=
x1.s1
+
x1.s2
+
x1.s3
;
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
;
/*
double
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
F
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
{
m[9][lidy-bheight]
=
((
double
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
double
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
double
)
x2.s0
)
*
lidy
; // m21
m[9][lidy-bheight]
=
((
F
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
F
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
F
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
...
...
@@ -714,11 +672,12 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
m[1][lidy-bheight]
=
x1.s0
; // m10
m[0][lidy-bheight]
=
x0.s0
; // m00
}
else
if
(
lidy
<
bheight
)
{
lm[9]
=
((
double
)
py
)
*
sy
; // m03
lm[8]
=
((
double
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
double
)
x2.s0
)
*
lidy
; // m21
lm[9]
=
((
F
)
py
)
*
sy
; // m03
lm[8]
=
((
F
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
F
)
x2.s0
)
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
...
...
@@ -741,69 +700,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
}
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
*/
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
//#if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
CvMoments_D6
(
__global
double*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
double*
dst_m00,
__global
double*
dst_m10,
__global
double*
dst_m01,
__global
double*
dst_m20,
__global
double*
dst_m11,
__global
double*
dst_m02,
__global
double*
dst_m30,
__global
double*
dst_m21,
__global
double*
dst_m12,
__global
double*
dst_m03,
int
dst_cols,
int
dst_step,
__kernel
void
CvMoments_D6
(
__global
F*
src_data,
int
src_rows,
int
src_cols,
int
src_step,
int
tileSize_width,
int
tileSize_height,
__global
F*
dst_m,
int
dst_cols,
int
dst_step,
int
blocky,
int
type,
int
depth,
int
cn,
int
coi,
int
binary,
const
int
TILE_SIZE
)
{
double
tmp_coi[4]
; // get the coi data
double
4
tmp[64]
;
F
tmp_coi[4]
; // get the coi data
F
4
tmp[64]
;
int
VLEN_D
=
4
; // length of vetor
int
gidy
=
get_global_id
(
0
)
;
int
gidx
=
get_global_id
(
1
)
;
...
...
@@ -820,39 +769,39 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col
if
(
tileSize_width
<
TILE_SIZE
)
for
(
int
i
=
tileSize_width
; i < rstep; i++ )
*
((
__global
double
*
)
src_data+
(
y+lidy
)
*src_step/8+x+i
)
=
0
;
*
((
__global
F
*
)
src_data+
(
y+lidy
)
*src_step/8+x+i
)
=
0
;
if
(
coi
>
0
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
{
for
(
int
j=0
; j<4; j++)
tmp_coi[j]
=
*
(
src_data+
(
y+lidy
)
*src_step/8+
(
x+i+j
)
*kcn+coi-1
)
;
tmp[i/VLEN_D]
=
(
double
4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
tmp[i/VLEN_D]
=
(
F
4
)(
tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]
)
;
}
else
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D]
=
(
double
4
)(
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+3
))
;
double4
zero
=
(
double
4
)(
0
)
;
double4
full
=
(
double
4
)(
255
)
;
tmp[i/VLEN_D]
=
(
F
4
)(
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+1
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+2
)
,
*
(
src_data+
(
y+lidy
)
*src_step/8+x+i+3
))
;
F4
zero
=
(
F
4
)(
0
)
;
F4
full
=
(
F
4
)(
255
)
;
if
(
binary
)
for
(
int
i=0
; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D]
=
(
tmp[i/VLEN_D]!=zero
)
?full:zero
;
double
mom[10]
;
__local
double
m[10][128]
;
F
mom[10]
;
__local
F
m[10][128]
;
if
(
lidy
==
0
)
for
(
int
i=0
; i<10; i++)
for
(
int
j=0
; j<128; j++)
m[i][j]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
double
lm[10]
=
{0}
;
double4
x0
=
(
double
4
)(
0
)
;
double4
x1
=
(
double
4
)(
0
)
;
double4
x2
=
(
double
4
)(
0
)
;
double4
x3
=
(
double
4
)(
0
)
;
F
lm[10]
=
{0}
;
F4
x0
=
(
F
4
)(
0
)
;
F4
x1
=
(
F
4
)(
0
)
;
F4
x2
=
(
F
4
)(
0
)
;
F4
x3
=
(
F
4
)(
0
)
;
for
(
int
xt
=
0
; xt < tileSize_width; xt+=VLEN_D )
{
double4
v_xt
=
(
double
4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
double
4
p
=
tmp[xt/VLEN_D]
;
double
4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
F4
v_xt
=
(
F
4
)(
xt,
xt+1,
xt+2,
xt+3
)
;
F
4
p
=
tmp[xt/VLEN_D]
;
F
4
xp
=
v_xt
*
p,
xxp
=
xp
*
v_xt
;
x0
+=
p
;
x1
+=
xp
;
x2
+=
xxp
;
...
...
@@ -863,13 +812,13 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col
x2.s0
+=
x2.s1
+
x2.s2
+
x2.s3
;
x3.s0
+=
x3.s1
+
x3.s2
+
x3.s3
;
double
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
F
py
=
lidy
*
x0.s0,
sy
=
lidy*lidy
;
int
bheight
=
min
(
tileSize_height,
TILE_SIZE/2
)
;
if
(
bheight
>=
TILE_SIZE/2&&lidy
>
bheight-1&&lidy
<
tileSize_height
)
{
m[9][lidy-bheight]
=
((
double
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
double
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
double
)
x2.s0
)
*
lidy
; // m21
m[9][lidy-bheight]
=
((
F
)
py
)
*
sy
; // m03
m[8][lidy-bheight]
=
((
F
)
x1.s0
)
*
sy
; // m12
m[7][lidy-bheight]
=
((
F
)
x2.s0
)
*
lidy
; // m21
m[6][lidy-bheight]
=
x3.s0
; // m30
m[5][lidy-bheight]
=
x0.s0
*
sy
; // m02
m[4][lidy-bheight]
=
x1.s0
*
lidy
; // m11
...
...
@@ -881,9 +830,9 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col
else
if
(
lidy
<
bheight
)
{
lm[9]
=
((
double
)
py
)
*
sy
; // m03
lm[8]
=
((
double
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
double
)
x2.s0
)
*
lidy
; // m21
lm[9]
=
((
F
)
py
)
*
sy
; // m03
lm[8]
=
((
F
)
x1.s0
)
*
sy
; // m12
lm[7]
=
((
F
)
x2.s0
)
*
lidy
; // m21
lm[6]
=
x3.s0
; // m30
lm[5]
=
x0.s0
*
sy
; // m02
lm[4]
=
x1.s0
*
lidy
; // m11
...
...
@@ -907,47 +856,47 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col
if
(
lidy
==
0&&lidx
==
0
)
{
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
=
(
double
)
lm[mt]
;
mom[mt]
=
(
F
)
lm[mt]
;
if
(
binary
)
{
double
s
=
1./255
;
F
s
=
1./255
;
for
(
int
mt
=
0
; mt < 10; mt++ )
mom[mt]
*=
s
;
}
double
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
F
xm
=
x
*
mom[0],
ym
=
y
*
mom[0]
;
//
accumulate
moments
computed
in
each
tile
dst_step
/=
sizeof
(
F
)
;
//
+
m00
(
=
m00
'
)
dst_m00[wgidy*dst_cols+wgidx]
=
mom[0]
;
*
(
dst_m
+
mad24
(
DST_ROW_00
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[0]
;
//
+
m10
(
=
m10
'
+
x*m00
'
)
dst_m10[wgidy*dst_cols+wgidx]
=
mom[1]
+
xm
;
*
(
dst_m
+
mad24
(
DST_ROW_10
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[1]
+
xm
;
//
+
m01
(
=
m01
'
+
y*m00
'
)
dst_m01[wgidy*dst_cols+wgidx]
=
mom[2]
+
ym
;
*
(
dst_m
+
mad24
(
DST_ROW_01
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[2]
+
ym
;
//
+
m20
(
=
m20
'
+
2*x*m10
'
+
x*x*m00
'
)
dst_m20[wgidy*dst_cols+wgidx]
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
*
(
dst_m
+
mad24
(
DST_ROW_20
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[3]
+
x
*
(
mom[1]
*
2
+
xm
)
;
//
+
m11
(
=
m11
'
+
x*m01
'
+
y*m10
'
+
x*y*m00
'
)
dst_m11[wgidy*dst_cols+wgidx]
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
*
(
dst_m
+
mad24
(
DST_ROW_11
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[4]
+
x
*
(
mom[2]
+
ym
)
+
y
*
mom[1]
;
//
+
m02
(
=
m02
'
+
2*y*m01
'
+
y*y*m00
'
)
dst_m02[wgidy*dst_cols+wgidx]
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
*
(
dst_m
+
mad24
(
DST_ROW_02
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[5]
+
y
*
(
mom[2]
*
2
+
ym
)
;
//
+
m30
(
=
m30
'
+
3*x*m20
'
+
3*x*x*m10
'
+
x*x*x*m00
'
)
dst_m30[wgidy*dst_cols+wgidx]
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
*
(
dst_m
+
mad24
(
DST_ROW_30
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[6]
+
x
*
(
3.
*
mom[3]
+
x
*
(
3.
*
mom[1]
+
xm
))
;
//
+
m21
(
=
m21
'
+
x*
(
2*m11
'
+
2*y*m10
'
+
x*m01
'
+
x*y*m00
'
)
+
y*m20
'
)
dst_m21[wgidy*dst_cols+wgidx]
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
*
(
dst_m
+
mad24
(
DST_ROW_21
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[7]
+
x
*
(
2
*
(
mom[4]
+
y
*
mom[1]
)
+
x
*
(
mom[2]
+
ym
))
+
y
*
mom[3]
;
//
+
m12
(
=
m12
'
+
y*
(
2*m11
'
+
2*x*m01
'
+
y*m10
'
+
x*y*m00
'
)
+
x*m02
'
)
dst_m12[wgidy*dst_cols+wgidx]
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
*
(
dst_m
+
mad24
(
DST_ROW_12
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[8]
+
y
*
(
2
*
(
mom[4]
+
x
*
mom[2]
)
+
y
*
(
mom[1]
+
xm
))
+
x
*
mom[5]
;
//
+
m03
(
=
m03
'
+
3*y*m02
'
+
3*y*y*m01
'
+
y*y*y*m00
'
)
dst_m03[wgidy*dst_cols+wgidx]
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
*
(
dst_m
+
mad24
(
DST_ROW_03
*
blocky,
dst_step,
mad24
(
wgidy,
dst_cols,
wgidx
)))
=
mom[9]
+
y
*
(
3.
*
mom[5]
+
y
*
(
3.
*
mom[2]
+
ym
))
;
}
}
//#endif
}
\ No newline at end of file
modules/ocl/src/opencl/pyr_up.cl
View file @
6c99b5c9
...
...
@@ -16,6 +16,8 @@
//
//
@Authors
//
Zhang
Chunpeng
chunpeng@multicorewareinc.com
//
Dachuan
Zhao,
dachuan@multicorewareinc.com
//
Yao
Wang,
yao@multicorewareinc.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
...
...
@@ -53,20 +55,22 @@ uchar get_valid_uchar(uchar data)
//////////////////////////
CV_8UC1
//////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel
void
pyrUp_C1_D0
(
__global
uchar*
src,__global
uchar*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
__local
float
s_srcPatch[10][10]
;
__local
float
s_dstPatch[20][16]
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
tidy
=
get_local_id
(
1
)
;
const
int
lsizex
=
get_local_size
(
0
)
;
const
int
lsizey
=
get_local_size
(
1
)
;
if
(
get_local_id
(
0
)
<
10
&&
get_local_id
(
1
)
<
10
)
if
(
tidx
<
10
&&
tidy
<
10
)
{
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
get_local_id
(
0
)
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
get_local_id
(
1
)
)
-
1
;
int
srcx
=
mad24
((
int
)
get_group_id
(
0
)
,
(
lsizex>>1
)
,
tidx
)
-
1
;
int
srcy
=
mad24
((
int
)
get_group_id
(
1
)
,
(
lsizey>>1
)
,
tidy
)
-
1
;
srcx
=
abs
(
srcx
)
;
srcx
=
min
(
srcCols
-
1
,
srcx
)
;
...
...
@@ -74,25 +78,24 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
srcy
=
abs
(
srcy
)
;
srcy
=
min
(
srcRows
-1
,
srcy
)
;
s_srcPatch[
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
(
float
)(
src[srcx
+
srcy
*
srcStep]
)
;
s_srcPatch[
tidy][tidx
]
=
(
float
)(
src[srcx
+
srcy
*
srcStep]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float
sum
=
0
;
const
int
evenFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
!=
0
)
;
const
bool
eveny
=
((
get_local_id
(
1
)
&
1
)
==
0
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
evenFlag
=
(
int
)((
tidx
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
tidx
&
1
)
!=
0
)
;
const
bool
eveny
=
((
tidy
&
1
)
==
0
)
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
sum
=
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[2
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
...
...
@@ -103,42 +106,40 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[0
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
0
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
0
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
0
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
0
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
sum
=
(
evenFlag
*
0.0625f
)
*
s_srcPatch[lsizey
-
16
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[9
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
9
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
9
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
9
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
(
evenFlag
*
0.0625f
)
*
s_srcPatch[lsizey
-
7
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[4
+
tidy][tidx
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
0
;
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
;
sum
=
sum
+
0.375f
*
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
;
sum
=
0.0625f
*
s_dstPatch[2
+
tidy
-
2][tidx]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
-
1][tidx]
;
sum
=
sum
+
0.375f
*
s_dstPatch[2
+
tidy
][tidx]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
+
1][tidx]
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
+
2][tidx]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
dst[x
+
y
*
dstStep]
=
(
float
)(
4.0f
*
sum
)
;
...
...
@@ -149,8 +150,8 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
//////////////////////////
CV_16UC1
/////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel
void
pyrUp_C1_D2
(
__global
ushort*
src,__global
ushort*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
...
...
@@ -210,13 +211,13 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
0
;
if
(
eveny
)
...
...
@@ -228,7 +229,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -251,12 +252,15 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
//////////////////////////
CV_32FC1
/////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel
void
pyrUp_C1_D5
(
__global
float*
src,__global
float*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
tidy
=
get_local_id
(
1
)
;
const
int
lsizex
=
get_local_size
(
0
)
;
const
int
lsizey
=
get_local_size
(
1
)
;
__local
float
s_srcPatch[10][10]
;
__local
float
s_dstPatch[20][16]
;
...
...
@@ -266,10 +270,10 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
dstStep
=
dstStep
>>
2
;
if
(
get_local_id
(
0
)
<
10
&&
get_local_id
(
1
)
<
10
)
if
(
tidx
<
10
&&
tidy
<
10
)
{
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
get_local_id
(
0
)
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
get_local_id
(
1
)
)
-
1
;
int
srcx
=
mad24
((
int
)
get_group_id
(
0
)
,
lsizex>>1,
tidx
)
-
1
;
int
srcy
=
mad24
((
int
)
get_group_id
(
1
)
,
lsizey>>1,
tidy
)
-
1
;
srcx
=
abs
(
srcx
)
;
srcx
=
min
(
srcCols
-
1
,
srcx
)
;
...
...
@@ -277,71 +281,67 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
srcy
=
abs
(
srcy
)
;
srcy
=
min
(
srcRows
-1
,
srcy
)
;
s_srcPatch[
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
(
float
)(
src[srcx
+
srcy
*
srcStep]
)
;
s_srcPatch[
tidy][tidx
]
=
(
float
)(
src[srcx
+
srcy
*
srcStep]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float
sum
=
0
;
const
int
evenFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
!=
0
)
;
const
bool
eveny
=
((
get_local_id
(
1
)
&
1
)
==
0
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
evenFlag
=
(
int
)((
tidx
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
tidx
&
1
)
!=
0
)
;
const
bool
eveny
=
((
tidy
&
1
)
==
0
)
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[2
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
s_dstPatch[2
+
tidy][tidx
]
=
sum
;
if
(
get_local_id
(
1
)
<
2
)
if
(
tidy
<
2
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
0
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
0
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
0
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
0
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
0
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
16
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[tidy][tidx
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
if
(
tidy
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
9
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
9
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
9
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
9
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
9
][1
+
((
tidx
+
2
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.375f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
0.25f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
0.0625f
)
*
s_srcPatch[
lsizey
-
7
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[4
+
tidy][tidx
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
0
;
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
;
sum
=
sum
+
0.375f
*
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
;
sum
=
0.0625f
*
s_dstPatch[2
+
tidy
-
2][tidx]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
-
1][tidx]
;
sum
=
sum
+
0.375f
*
s_dstPatch[2
+
tidy
][tidx]
;
sum
=
sum
+
0.25f
*
s_dstPatch[2
+
tidy
+
1][tidx]
;
sum
=
sum
+
0.0625f
*
s_dstPatch[2
+
tidy
+
2][tidx]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
dst[x
+
y
*
dstStep]
=
(
float
)(
4.0f
*
sum
)
;
...
...
@@ -376,37 +376,16 @@ uchar4 convert_float4_to_uchar4(float4 data)
return
u4Data
;
}
float4
int_x_float4
(
int
leftOpr,float4
rightOpr
)
{
float4
result
=
{0,0,0,0}
;
result.x
=
rightOpr.x
*
leftOpr
;
result.y
=
rightOpr.y
*
leftOpr
;
result.z
=
rightOpr.z
*
leftOpr
;
result.w
=
rightOpr.w
*
leftOpr
;
return
result
;
}
float4
float4_x_float4
(
float4
leftOpr,float4
rightOpr
)
{
float4
result
;
result.x
=
leftOpr.x
*
rightOpr.x
;
result.y
=
leftOpr.y
*
rightOpr.y
;
result.z
=
leftOpr.z
*
rightOpr.z
;
result.w
=
leftOpr.w
*
rightOpr.w
;
return
result
;
}
__kernel
void
pyrUp_C4_D0
(
__global
uchar4*
src,__global
uchar4*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
tidy
=
get_local_id
(
1
)
;
const
int
lsizex
=
get_local_size
(
0
)
;
const
int
lsizey
=
get_local_size
(
1
)
;
__local
float4
s_srcPatch[10][10]
;
__local
float4
s_dstPatch[20][16]
;
...
...
@@ -416,10 +395,10 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
dstStep
>>=
2
;
if
(
get_local_id
(
0
)
<
10
&&
get_local_id
(
1
)
<
10
)
if
(
tidx
<
10
&&
tidy
<
10
)
{
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
get_local_id
(
0
)
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
get_local_id
(
1
)
)
-
1
;
int
srcx
=
mad24
((
int
)
get_group_id
(
0
)
,
lsizex>>1,
tidx
)
-
1
;
int
srcy
=
mad24
((
int
)
get_group_id
(
1
)
,
lsizey>>1,
tidy
)
-
1
;
srcx
=
abs
(
srcx
)
;
srcx
=
min
(
srcCols
-
1
,
srcx
)
;
...
...
@@ -427,17 +406,16 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
srcy
=
abs
(
srcy
)
;
srcy
=
min
(
srcRows
-1
,
srcy
)
;
s_srcPatch[
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
covert_uchar4_to_float4
(
src[srcx
+
srcy
*
srcStep]
)
;
s_srcPatch[
tidy][tidx
]
=
covert_uchar4_to_float4
(
src[srcx
+
srcy
*
srcStep]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float4
sum
=
(
float4
)(
0
,
0
,
0
,
0
)
;
const
int
evenFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
!=
0
)
;
const
bool
eveny
=
((
get_local_id
(
1
)
&
1
)
==
0
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
evenFlag
=
(
int
)((
tidx
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
tidx
&
1
)
!=
0
)
;
const
bool
eveny
=
((
tidy
&
1
)
==
0
)
;
float4
co1
=
(
float4
)(
0.375f,
0.375f,
0.375f,
0.375f
)
;
float4
co2
=
(
float4
)(
0.25f,
0.25f,
0.25f,
0.25f
)
;
...
...
@@ -446,63 +424,59 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co1
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[2
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
s_dstPatch[2
+
tidy][tidx
]
=
sum
;
if
(
get_local_id
(
1
)
<
2
)
if
(
tidy
<
2
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[tidy][tidx
]
=
sum
;
}
if
(
tidy
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[9][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[4
+
tidy][tidx
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
0
;
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co1
,
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
)
;
sum
=
co3
*
s_dstPatch[2
+
tidy
-
2][tidx]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
-
1][tidx]
;
sum
=
sum
+
co1
*
s_dstPatch[2
+
tidy
][tidx]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
+
1][tidx]
;
sum
=
sum
+
co3
*
s_dstPatch[2
+
tidy
+
2][tidx]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
{
dst[x
+
y
*
dstStep]
=
convert_float4_to_uchar4
(
int_x_float4
(
4.0f,sum
)
)
;
dst[x
+
y
*
dstStep]
=
convert_float4_to_uchar4
(
4.0f
*
sum
)
;
}
}
///////////////////////////////////////////////////////////////////////
...
...
@@ -535,8 +509,8 @@ ushort4 convert_float4_to_ushort4(float4 data)
__kernel
void
pyrUp_C4_D2
(
__global
ushort4*
src,__global
ushort4*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
...
...
@@ -580,11 +554,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co1
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag*
co3
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag*
co1
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag*
co3
)
*
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
...
...
@@ -596,31 +570,31 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[0][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[0][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[9][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[9][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[9][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[9][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[9][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
...
@@ -628,15 +602,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co1
,
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
)
;
sum
=
sum
+
co3
*
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
;
sum
=
sum
+
co1
*
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
;
sum
=
sum
+
co3
*
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
{
dst[x
+
y
*
dstStep]
=
convert_float4_to_ushort4
(
int_x_float4
(
4.0f,sum
)
)
;
dst[x
+
y
*
dstStep]
=
convert_float4_to_ushort4
(
4.0f
*
sum
)
;
}
}
...
...
@@ -644,12 +618,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
//////////////////////////
CV_32FC4
//////////////////////////////////
///////////////////////////////////////////////////////////////////////
__kernel
void
pyrUp_C4_D5
(
__global
float4*
src,__global
float4*
dst,
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
int
srcRows,int
dstRows,int
srcCols,int
dstCols,
int
srcOffset,int
dstOffset,int
srcStep,int
dstStep
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
tidy
=
get_local_id
(
1
)
;
const
int
lsizex
=
get_local_size
(
0
)
;
const
int
lsizey
=
get_local_size
(
1
)
;
__local
float4
s_srcPatch[10][10]
;
__local
float4
s_dstPatch[20][16]
;
...
...
@@ -659,10 +636,10 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
dstStep
>>=
4
;
if
(
get_local_id
(
0
)
<
10
&&
get_local_id
(
1
)
<
10
)
if
(
tidx
<
10
&&
tidy
<
10
)
{
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
get_local_id
(
0
)
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
get_local_id
(
1
)
)
-
1
;
int
srcx
=
(
int
)(
get_group_id
(
0
)
*
get_local_size
(
0
)
/
2
+
tidx
)
-
1
;
int
srcy
=
(
int
)(
get_group_id
(
1
)
*
get_local_size
(
1
)
/
2
+
tidy
)
-
1
;
srcx
=
abs
(
srcx
)
;
srcx
=
min
(
srcCols
-
1
,
srcx
)
;
...
...
@@ -670,17 +647,16 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
srcy
=
abs
(
srcy
)
;
srcy
=
min
(
srcRows
-1
,
srcy
)
;
s_srcPatch[
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
(
float4
)(
src[srcx
+
srcy
*
srcStep]
)
;
s_srcPatch[
tidy][tidx
]
=
(
float4
)(
src[srcx
+
srcy
*
srcStep]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float4
sum
=
(
float4
)(
0
,
0
,
0
,
0
)
;
const
int
evenFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
get_local_id
(
0
)
&
1
)
!=
0
)
;
const
bool
eveny
=
((
get_local_id
(
1
)
&
1
)
==
0
)
;
const
int
tidx
=
get_local_id
(
0
)
;
const
int
evenFlag
=
(
int
)((
tidx
&
1
)
==
0
)
;
const
int
oddFlag
=
(
int
)((
tidx
&
1
)
!=
0
)
;
const
bool
eveny
=
((
tidy
&
1
)
==
0
)
;
float4
co1
=
(
float4
)(
0.375f,
0.375f,
0.375f,
0.375f
)
;
float4
co2
=
(
float4
)(
0.25f,
0.25f,
0.25f,
0.25f
)
;
...
...
@@ -689,59 +665,55 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co1
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag,
co3
)
,
s_srcPatch[1
+
(
get_local_id
(
1
)
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag*
co3
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag*
co1
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag*
co3
)
*
s_srcPatch[1
+
(
tidy
>>
1
)
][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[2
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
s_dstPatch[2
+
tidy][tidx
]
=
sum
;
if
(
get_local_id
(
1
)
<
2
)
if
(
tidy
<
2
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[0][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[0][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[0][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
}
s_dstPatch[get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
if
(
get_local_id
(
1
)
>
13
)
{
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-16][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[tidy][tidx
]
=
sum
;
}
if
(
tidy
>
13
)
{
sum
=
0
;
if
(
eveny
)
{
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
-
2
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
-
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co1
)
,
s_srcPatch[9][1
+
((
tidx
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
oddFlag
,
co2
)
,
s_srcPatch[9][1
+
((
tidx
+
1
)
>>
1
)
]
)
;
sum
=
sum
+
float4_x_float4
(
int_x_float4
(
evenFlag
,
co3
)
,
s_srcPatch[9][1
+
((
tidx
+
2
)
>>
1
)
]
)
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
-
2
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
-
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co1
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
)
>>
1
)
]
;
sum
=
sum
+
(
oddFlag
*
co2
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
+
1
)
>>
1
)
]
;
sum
=
sum
+
(
evenFlag
*
co3
)
*
s_srcPatch[lsizey-7][1
+
((
tidx
+
2
)
>>
1
)
]
;
}
s_dstPatch[4
+
get_local_id
(
1
)
][get_local_id
(
0
)
]
=
sum
;
}
s_dstPatch[4
+
tidy][tidx
]
=
sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
0
;
const
int
tidy
=
get_local_id
(
1
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
-
2][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
-
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co1
,
s_dstPatch[2
+
tidy
][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co2
,
s_dstPatch[2
+
tidy
+
1][get_local_id
(
0
)
]
)
;
sum
=
sum
+
float4_x_float4
(
co3
,
s_dstPatch[2
+
tidy
+
2][get_local_id
(
0
)
]
)
;
sum
=
co3
*
s_dstPatch[2
+
tidy
-
2][tidx]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
-
1][tidx]
;
sum
=
sum
+
co1
*
s_dstPatch[2
+
tidy
][tidx]
;
sum
=
sum
+
co2
*
s_dstPatch[2
+
tidy
+
1][tidx]
;
sum
=
sum
+
co3
*
s_dstPatch[2
+
tidy
+
2][tidx]
;
if
((
x
<
dstCols
)
&&
(
y
<
dstRows
))
{
...
...
modules/ocl/src/opencl/stereobm.cl
View file @
6c99b5c9
...
...
@@ -323,7 +323,7 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
float
conv
=
0
;
int
y1
=
y==0?
0
:
y-1
;
int
x1
=
x==0?
0
:
x-1
;
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
&&
x
>
0
&&
y
>
0
)
{
conv
=
(
float
)
input[
(
y1
)
*
cols
+
(
x1
)
]
*
(
-1
)
+
(
float
)
input[
(
y1
)
*
cols
+
(
x+1
)
]
*
(
1
)
+
(
float
)
input[
(
y
)
*
cols
+
(
x1
)
]
*
(
-2
)
+
(
float
)
input[
(
y
)
*
cols
+
(
x+1
)
]
*
(
2
)
+
...
...
modules/ocl/test/test_brute_force_matcher.cpp
View file @
6c99b5c9
...
...
@@ -110,7 +110,7 @@ namespace
}
};
TEST_P
(
BruteForceMatcher
,
DISABLED_
Match_Single
)
TEST_P
(
BruteForceMatcher
,
Match_Single
)
{
cv
::
ocl
::
BruteForceMatcher_OCL_base
matcher
(
distType
);
...
...
@@ -130,7 +130,7 @@ namespace
ASSERT_EQ
(
0
,
badCount
);
}
TEST_P
(
BruteForceMatcher
,
DISABLED_
KnnMatch_2_Single
)
TEST_P
(
BruteForceMatcher
,
KnnMatch_2_Single
)
{
const
int
knn
=
2
;
...
...
modules/ocl/test/test_match_template.cpp
View file @
6c99b5c9
...
...
@@ -75,7 +75,7 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho
}
};
TEST_P
(
MatchTemplate8U
,
DISABLED_
Accuracy
)
TEST_P
(
MatchTemplate8U
,
Accuracy
)
{
std
::
cout
<<
"Method: "
<<
TEMPLATE_METHOD_NAMES
[
method
]
<<
std
::
endl
;
...
...
@@ -138,18 +138,18 @@ TEST_P(MatchTemplate32F, Accuracy)
EXPECT_MAT_NEAR
(
dst_gold
,
mat_dst
,
templ_size
.
area
()
*
1e-1
,
sss
);
}
INSTANTIATE_TEST_CASE_P
(
GPU
_ImgProc
,
MatchTemplate8U
,
INSTANTIATE_TEST_CASE_P
(
OCL
_ImgProc
,
MatchTemplate8U
,
testing
::
Combine
(
MTEMP_SIZES
,
testing
::
Values
(
TemplateSize
(
cv
::
Size
(
5
,
5
)),
TemplateSize
(
cv
::
Size
(
16
,
16
))
/*, TemplateSize(cv::Size(30, 30))*/
),
testing
::
Values
(
TemplateSize
(
cv
::
Size
(
5
,
5
)),
TemplateSize
(
cv
::
Size
(
16
,
16
))
,
TemplateSize
(
cv
::
Size
(
30
,
30
))
),
testing
::
Values
(
Channels
(
1
),
Channels
(
3
),
Channels
(
4
)),
ALL_TEMPLATE_METHODS
)
);
INSTANTIATE_TEST_CASE_P
(
GPU
_ImgProc
,
MatchTemplate32F
,
testing
::
Combine
(
INSTANTIATE_TEST_CASE_P
(
OCL
_ImgProc
,
MatchTemplate32F
,
testing
::
Combine
(
MTEMP_SIZES
,
testing
::
Values
(
TemplateSize
(
cv
::
Size
(
5
,
5
)),
TemplateSize
(
cv
::
Size
(
16
,
16
))
/*, TemplateSize(cv::Size(30, 30))*/
),
testing
::
Values
(
TemplateSize
(
cv
::
Size
(
5
,
5
)),
TemplateSize
(
cv
::
Size
(
16
,
16
))
,
TemplateSize
(
cv
::
Size
(
30
,
30
))
),
testing
::
Values
(
Channels
(
1
),
Channels
(
3
),
Channels
(
4
)),
testing
::
Values
(
TemplateMethod
(
cv
::
TM_SQDIFF
),
TemplateMethod
(
cv
::
TM_CCORR
))));
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment