Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
02fb3f0a
Commit
02fb3f0a
authored
Dec 17, 2013
by
Vadim Pisarevsky
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
it finally works!!!
parent
ef509ace
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
151 additions
and
575 deletions
+151
-575
cascadedetect.cpp
modules/objdetect/src/cascadedetect.cpp
+49
-32
cascadedetect.hpp
modules/objdetect/src/cascadedetect.hpp
+7
-7
haarobjectdetect.cl
modules/objdetect/src/opencl/haarobjectdetect.cl
+95
-536
No files found.
modules/objdetect/src/cascadedetect.cpp
View file @
02fb3f0a
...
@@ -44,6 +44,7 @@
...
@@ -44,6 +44,7 @@
#include "cascadedetect.hpp"
#include "cascadedetect.hpp"
#include "opencv2/objdetect/objdetect_c.h"
#include "opencv2/objdetect/objdetect_c.h"
#include "opencl_kernels.hpp"
#if defined (LOG_CASCADE_STATISTIC)
#if defined (LOG_CASCADE_STATISTIC)
struct
Logger
struct
Logger
...
@@ -491,7 +492,7 @@ bool HaarEvaluator::read(const FileNode& node)
...
@@ -491,7 +492,7 @@ bool HaarEvaluator::read(const FileNode& node)
features
->
resize
(
n
);
features
->
resize
(
n
);
FileNodeIterator
it
=
node
.
begin
();
FileNodeIterator
it
=
node
.
begin
();
hasTiltedFeatures
=
false
;
hasTiltedFeatures
=
false
;
std
::
vector
<
Feature
>
ff
=
*
features
;
std
::
vector
<
Feature
>
&
ff
=
*
features
;
sumSize0
=
Size
();
sumSize0
=
Size
();
ufbuf
.
release
();
ufbuf
.
release
();
...
@@ -552,30 +553,37 @@ bool HaarEvaluator::setImage( InputArray _image, Size _origWinSize, Size _sumSiz
...
@@ -552,30 +553,37 @@ bool HaarEvaluator::setImage( InputArray _image, Size _origWinSize, Size _sumSiz
tofs
=
(
int
)((
utilted
.
offset
-
usum
.
offset
)
/
sizeof
(
int
));
tofs
=
(
int
)((
utilted
.
offset
-
usum
.
offset
)
/
sizeof
(
int
));
}
}
else
else
{
integral
(
_image
,
usum
,
noArray
(),
noArray
(),
CV_32S
);
integral
(
_image
,
usum
,
noArray
(),
noArray
(),
CV_32S
);
}
sqrBoxFilter
(
_image
,
usqsum
,
CV_32S
,
sqrBoxFilter
(
_image
,
usqsum
,
CV_32S
,
Size
(
normrect
.
width
,
normrect
.
height
),
Size
(
normrect
.
width
,
normrect
.
height
),
Point
(
0
,
0
),
false
);
Point
(
0
,
0
),
false
);
/*sqrBoxFilter(_image.getMat(), sqsum, CV_32S,
Size(normrect.width, normrect.height),
Point(0, 0), false);
sqsum.copyTo(usqsum);*/
sumStep
=
(
int
)(
usum
.
step
/
usum
.
elemSize
());
sumStep
=
(
int
)(
usum
.
step
/
usum
.
elemSize
());
}
}
else
else
{
{
sum0
.
create
(
rn
*
rn_scale
,
cn
,
CV_32S
);
sum0
.
create
(
rn
*
rn_scale
,
cn
,
CV_32S
);
sqsum0
.
create
(
rn
,
cn
,
CV_
64F
);
sqsum0
.
create
(
rn
,
cn
,
CV_
32S
);
sum
=
sum0
(
Rect
(
0
,
0
,
cols
+
1
,
rows
+
1
));
sum
=
sum0
(
Rect
(
0
,
0
,
cols
+
1
,
rows
+
1
));
sqsum
=
sqsum0
(
Rect
(
0
,
0
,
cols
+
1
,
rows
+
1
));
sqsum
=
sqsum0
(
Rect
(
0
,
0
,
cols
,
rows
));
if
(
hasTiltedFeatures
)
if
(
hasTiltedFeatures
)
{
{
Mat
tilted
=
sum0
(
Rect
(
0
,
_sumSize
.
height
,
cols
+
1
,
rows
+
1
));
Mat
tilted
=
sum0
(
Rect
(
0
,
_sumSize
.
height
,
cols
+
1
,
rows
+
1
));
integral
(
_image
,
sum
,
sqsum
,
tilted
,
CV_32S
);
integral
(
_image
,
sum
,
noArray
()
,
tilted
,
CV_32S
);
tofs
=
(
int
)((
tilted
.
data
-
sum
.
data
)
/
sizeof
(
int
));
tofs
=
(
int
)((
tilted
.
data
-
sum
.
data
)
/
sizeof
(
int
));
}
}
else
else
integral
(
_image
,
sum
,
sqsum
,
noArray
(),
CV_32S
);
integral
(
_image
,
sum
,
noArray
()
,
noArray
(),
CV_32S
);
/*
sqrBoxFilter(_image, sqsum, CV_32S,
sqrBoxFilter
(
_image
,
sqsum
,
CV_32S
,
Size
(
normrect
.
width
,
normrect
.
height
),
Size
(
normrect
.
width
,
normrect
.
height
),
Point(0, 0), false);
*/
Point
(
0
,
0
),
false
);
sumStep
=
(
int
)(
sum
.
step
/
sum
.
elemSize
());
sumStep
=
(
int
)(
sum
.
step
/
sum
.
elemSize
());
}
}
...
@@ -592,7 +600,7 @@ bool HaarEvaluator::setImage( InputArray _image, Size _origWinSize, Size _sumSiz
...
@@ -592,7 +600,7 @@ bool HaarEvaluator::setImage( InputArray _image, Size _origWinSize, Size _sumSiz
optfeaturesPtr
[
fi
].
setOffsets
(
ff
[
fi
],
sumStep
,
tofs
);
optfeaturesPtr
[
fi
].
setOffsets
(
ff
[
fi
],
sumStep
,
tofs
);
}
}
if
(
_image
.
isUMat
()
&&
(
sumSize0
!=
_sumSize
||
ufbuf
.
empty
())
)
if
(
_image
.
isUMat
()
&&
(
sumSize0
!=
_sumSize
||
ufbuf
.
empty
())
)
copyVectorToUMat
(
ff
,
ufbuf
);
copyVectorToUMat
(
*
optfeatures
,
ufbuf
);
sumSize0
=
_sumSize
;
sumSize0
=
_sumSize
;
return
true
;
return
true
;
...
@@ -608,13 +616,7 @@ bool HaarEvaluator::setWindow( Point pt )
...
@@ -608,13 +616,7 @@ bool HaarEvaluator::setWindow( Point pt )
const
int
*
p
=
&
sum
.
at
<
int
>
(
pt
);
const
int
*
p
=
&
sum
.
at
<
int
>
(
pt
);
int
valsum
=
CALC_SUM_OFS
(
nofs
,
p
);
int
valsum
=
CALC_SUM_OFS
(
nofs
,
p
);
double
valsqsum
=
sqsum
.
at
<
int
>
(
pt
.
y
+
normrect
.
y
,
pt
.
x
+
normrect
.
x
);
int
nqofs
[
4
];
CV_SUM_OFS
(
nqofs
[
0
],
nqofs
[
1
],
nqofs
[
2
],
nqofs
[
3
],
0
,
normrect
,
(
int
)(
sqsum
.
step
/
sizeof
(
double
))
);
const
double
*
pq
=
&
sqsum
.
at
<
double
>
(
pt
);
double
valsqsum
=
CALC_SUM_OFS
(
nqofs
,
pq
);
//double valsqsum = sqsum.at<int>(pt.y + normrect.y, pt.x + normrect.x);
double
nf
=
(
double
)
normrect
.
area
()
*
valsqsum
-
(
double
)
valsum
*
valsum
;
double
nf
=
(
double
)
normrect
.
area
()
*
valsqsum
-
(
double
)
valsum
*
valsum
;
if
(
nf
>
0.
)
if
(
nf
>
0.
)
...
@@ -1131,8 +1133,6 @@ bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processin
...
@@ -1131,8 +1133,6 @@ bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processin
bool
CascadeClassifierImpl
::
ocl_detectSingleScale
(
InputArray
_image
,
Size
processingRectSize
,
bool
CascadeClassifierImpl
::
ocl_detectSingleScale
(
InputArray
_image
,
Size
processingRectSize
,
int
yStep
,
double
factor
,
Size
sumSize0
)
int
yStep
,
double
factor
,
Size
sumSize0
)
{
{
const
int
MAX_FACES
=
10000
;
Ptr
<
HaarEvaluator
>
haar
=
featureEvaluator
.
dynamicCast
<
HaarEvaluator
>
();
Ptr
<
HaarEvaluator
>
haar
=
featureEvaluator
.
dynamicCast
<
HaarEvaluator
>
();
if
(
haar
.
empty
()
)
if
(
haar
.
empty
()
)
return
false
;
return
false
;
...
@@ -1141,7 +1141,8 @@ bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size proce
...
@@ -1141,7 +1141,8 @@ bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size proce
if
(
cascadeKernel
.
empty
()
)
if
(
cascadeKernel
.
empty
()
)
{
{
//cascadeKernel.create(")
cascadeKernel
.
create
(
"runHaarClassifierStump"
,
ocl
::
objdetect
::
haarobjectdetect_oclsrc
,
format
(
"-D MAX_FACES=%d"
,
MAX_FACES
));
if
(
cascadeKernel
.
empty
()
)
if
(
cascadeKernel
.
empty
()
)
return
false
;
return
false
;
}
}
...
@@ -1152,30 +1153,35 @@ bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size proce
...
@@ -1152,30 +1153,35 @@ bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size proce
copyVectorToUMat
(
data
.
classifiers
,
uclassifiers
);
copyVectorToUMat
(
data
.
classifiers
,
uclassifiers
);
copyVectorToUMat
(
data
.
nodes
,
unodes
);
copyVectorToUMat
(
data
.
nodes
,
unodes
);
copyVectorToUMat
(
data
.
leaves
,
uleaves
);
copyVectorToUMat
(
data
.
leaves
,
uleaves
);
ufacepos
.
create
(
1
,
MAX_FACES
*
4
+
1
,
CV_32S
);
}
}
std
::
vector
<
UMat
>
bufs
;
std
::
vector
<
UMat
>
bufs
;
haar
->
getUMats
(
bufs
);
haar
->
getUMats
(
bufs
);
CV_Assert
(
bufs
.
size
()
==
3
);
CV_Assert
(
bufs
.
size
()
==
3
);
Rect
normrect
=
haar
->
getNormRect
();
//processingRectSize = Size(yStep, yStep);
size_t
globalsize
[]
=
{
processingRectSize
.
width
/
yStep
,
processingRectSize
.
height
/
yStep
};
size_t
globalsize
[]
=
{
processingRectSize
.
width
/
yStep
,
processingRectSize
.
height
/
yStep
};
return
cascadeKernel
.
args
(
ocl
::
KernelArg
::
ReadOnly
(
bufs
[
0
]),
// sum
cascadeKernel
.
args
(
ocl
::
KernelArg
::
ReadOnlyNoSize
(
bufs
[
0
]),
// sum
ocl
::
KernelArg
::
ReadOnly
(
bufs
[
1
]),
// sqsum
ocl
::
KernelArg
::
ReadOnly
NoSize
(
bufs
[
1
]),
// sqsum
ocl
::
KernelArg
::
PtrReadOnly
(
bufs
[
2
]),
// optfeatures
ocl
::
KernelArg
::
PtrReadOnly
(
bufs
[
2
]),
// optfeatures
// cascade classifier
// cascade classifier
(
int
)
data
.
stages
.
size
(),
ocl
::
KernelArg
::
PtrReadOnly
(
ustages
),
ocl
::
KernelArg
::
PtrReadOnly
(
ustages
),
ocl
::
KernelArg
::
PtrReadOnly
(
uclassifiers
),
ocl
::
KernelArg
::
PtrReadOnly
(
uclassifiers
),
ocl
::
KernelArg
::
PtrReadOnly
(
unodes
),
ocl
::
KernelArg
::
PtrReadOnly
(
unodes
),
ocl
::
KernelArg
::
PtrReadOnly
(
uleaves
),
ocl
::
KernelArg
::
PtrReadOnly
(
uleaves
),
ocl
::
KernelArg
::
WriteOnly
(
ufacepos
),
// positions
ocl
::
KernelArg
::
PtrWriteOnly
(
ufacepos
),
// positions
ocl
::
KernelArg
::
PtrReadOnly
(
uparams
),
processingRectSize
,
processingRectSize
.
width
,
yStep
,
(
float
)
factor
,
processingRectSize
.
height
,
normrect
,
data
.
origWinSize
);
yStep
,
(
float
)
factor
,
MAX_FACES
).
run
(
2
,
globalsize
,
0
,
false
);
bool
ok
=
cascadeKernel
.
run
(
2
,
globalsize
,
0
,
true
);
//CV_Assert(ok);
return
ok
;
}
}
bool
CascadeClassifierImpl
::
isOldFormatCascade
()
const
bool
CascadeClassifierImpl
::
isOldFormatCascade
()
const
...
@@ -1234,12 +1240,13 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
...
@@ -1234,12 +1240,13 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
if
(
maxObjectSize
.
height
==
0
||
maxObjectSize
.
width
==
0
)
if
(
maxObjectSize
.
height
==
0
||
maxObjectSize
.
width
==
0
)
maxObjectSize
=
imgsz
;
maxObjectSize
=
imgsz
;
bool
use_ocl
=
false
;
/*
ocl::useOpenCL() &&
bool
use_ocl
=
ocl
::
useOpenCL
()
&&
getFeatureType
()
==
FeatureEvaluator
::
HAAR
&&
getFeatureType
()
==
FeatureEvaluator
::
HAAR
&&
!
isOldFormatCascade
()
&&
!
isOldFormatCascade
()
&&
data
.
isStumpBased
&&
maskGenerator
.
empty
()
&&
maskGenerator
.
empty
()
&&
!
outputRejectLevels
&&
!
outputRejectLevels
&&
tryOpenCL;
*/
tryOpenCL
;
if
(
!
use_ocl
)
if
(
!
use_ocl
)
{
{
...
@@ -1268,13 +1275,20 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
...
@@ -1268,13 +1275,20 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
}
}
Size
sumSize0
((
imgsz
.
width
+
SUM_ALIGN
)
&
-
SUM_ALIGN
,
imgsz
.
height
+
1
);
Size
sumSize0
((
imgsz
.
width
+
SUM_ALIGN
)
&
-
SUM_ALIGN
,
imgsz
.
height
+
1
);
if
(
use_ocl
)
{
ufacepos
.
create
(
1
,
MAX_FACES
*
4
+
1
,
CV_32S
);
UMat
ufacecount
(
ufacepos
,
Rect
(
0
,
0
,
1
,
1
));
ufacecount
.
setTo
(
Scalar
::
all
(
0
));
}
for
(
double
factor
=
1
;
;
factor
*=
scaleFactor
)
for
(
double
factor
=
1
;
;
factor
*=
scaleFactor
)
{
{
Size
originalWindowSize
=
getOriginalWindowSize
();
Size
originalWindowSize
=
getOriginalWindowSize
();
Size
windowSize
(
cvRound
(
originalWindowSize
.
width
*
factor
),
cvRound
(
originalWindowSize
.
height
*
factor
)
);
Size
windowSize
(
cvRound
(
originalWindowSize
.
width
*
factor
),
cvRound
(
originalWindowSize
.
height
*
factor
)
);
Size
scaledImageSize
(
cvRound
(
grayImage
.
cols
/
factor
),
cvRound
(
grayImage
.
rows
/
factor
)
);
Size
scaledImageSize
(
cvRound
(
imgsz
.
width
/
factor
),
cvRound
(
imgsz
.
height
/
factor
)
);
Size
processingRectSize
(
scaledImageSize
.
width
-
originalWindowSize
.
width
,
Size
processingRectSize
(
scaledImageSize
.
width
-
originalWindowSize
.
width
,
scaledImageSize
.
height
-
originalWindowSize
.
height
);
scaledImageSize
.
height
-
originalWindowSize
.
height
);
...
@@ -1331,6 +1345,7 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
...
@@ -1331,6 +1345,7 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
Mat
facepos
=
ufacepos
.
getMat
(
ACCESS_READ
);
Mat
facepos
=
ufacepos
.
getMat
(
ACCESS_READ
);
const
int
*
fptr
=
facepos
.
ptr
<
int
>
();
const
int
*
fptr
=
facepos
.
ptr
<
int
>
();
int
i
,
nfaces
=
fptr
[
0
];
int
i
,
nfaces
=
fptr
[
0
];
printf
(
"nfaces = %d
\n
"
,
nfaces
);
for
(
i
=
0
;
i
<
nfaces
;
i
++
)
for
(
i
=
0
;
i
<
nfaces
;
i
++
)
{
{
candidates
.
push_back
(
Rect
(
fptr
[
i
*
4
+
1
],
fptr
[
i
*
4
+
2
],
fptr
[
i
*
4
+
3
],
fptr
[
i
*
4
+
4
]));
candidates
.
push_back
(
Rect
(
fptr
[
i
*
4
+
1
],
fptr
[
i
*
4
+
2
],
fptr
[
i
*
4
+
3
],
fptr
[
i
*
4
+
4
]));
...
@@ -1439,8 +1454,6 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
...
@@ -1439,8 +1454,6 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
origWinSize
.
height
=
(
int
)
root
[
CC_HEIGHT
];
origWinSize
.
height
=
(
int
)
root
[
CC_HEIGHT
];
CV_Assert
(
origWinSize
.
height
>
0
&&
origWinSize
.
width
>
0
);
CV_Assert
(
origWinSize
.
height
>
0
&&
origWinSize
.
width
>
0
);
isStumpBased
=
(
int
)(
root
[
CC_STAGE_PARAMS
][
CC_MAX_DEPTH
])
==
1
?
true
:
false
;
// load feature params
// load feature params
FileNode
fn
=
root
[
CC_FEATURE_PARAMS
];
FileNode
fn
=
root
[
CC_FEATURE_PARAMS
];
if
(
fn
.
empty
()
)
if
(
fn
.
empty
()
)
...
@@ -1460,6 +1473,7 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
...
@@ -1460,6 +1473,7 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
nodes
.
clear
();
nodes
.
clear
();
FileNodeIterator
it
=
fn
.
begin
(),
it_end
=
fn
.
end
();
FileNodeIterator
it
=
fn
.
begin
(),
it_end
=
fn
.
end
();
isStumpBased
=
true
;
for
(
int
si
=
0
;
it
!=
it_end
;
si
++
,
++
it
)
for
(
int
si
=
0
;
it
!=
it_end
;
si
++
,
++
it
)
{
{
...
@@ -1485,6 +1499,9 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
...
@@ -1485,6 +1499,9 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
DTree
tree
;
DTree
tree
;
tree
.
nodeCount
=
(
int
)
internalNodes
.
size
()
/
nodeStep
;
tree
.
nodeCount
=
(
int
)
internalNodes
.
size
()
/
nodeStep
;
if
(
tree
.
nodeCount
>
1
)
isStumpBased
=
false
;
classifiers
.
push_back
(
tree
);
classifiers
.
push_back
(
tree
);
nodes
.
reserve
(
nodes
.
size
()
+
tree
.
nodeCount
);
nodes
.
reserve
(
nodes
.
size
()
+
tree
.
nodeCount
);
...
...
modules/objdetect/src/cascadedetect.hpp
View file @
02fb3f0a
...
@@ -63,8 +63,8 @@ protected:
...
@@ -63,8 +63,8 @@ protected:
double
scaleFactor
,
Size
minObjectSize
,
Size
maxObjectSize
,
double
scaleFactor
,
Size
minObjectSize
,
Size
maxObjectSize
,
bool
outputRejectLevels
=
false
);
bool
outputRejectLevels
=
false
);
enum
{
BOOST
=
0
enum
{
MAX_FACES
=
10000
};
};
enum
{
BOOST
=
0
};
enum
{
DO_CANNY_PRUNING
=
CASCADE_DO_CANNY_PRUNING
,
enum
{
DO_CANNY_PRUNING
=
CASCADE_DO_CANNY_PRUNING
,
SCALE_IMAGE
=
CASCADE_SCALE_IMAGE
,
SCALE_IMAGE
=
CASCADE_SCALE_IMAGE
,
FIND_BIGGEST_OBJECT
=
CASCADE_FIND_BIGGEST_OBJECT
,
FIND_BIGGEST_OBJECT
=
CASCADE_FIND_BIGGEST_OBJECT
,
...
@@ -132,7 +132,7 @@ protected:
...
@@ -132,7 +132,7 @@ protected:
Ptr
<
MaskGenerator
>
maskGenerator
;
Ptr
<
MaskGenerator
>
maskGenerator
;
UMat
ugrayImage
,
uimageBuffer
;
UMat
ugrayImage
,
uimageBuffer
;
UMat
ufacepos
,
ustages
,
uclassifiers
,
unodes
,
uleaves
,
usubsets
,
uparams
;
UMat
ufacepos
,
ustages
,
uclassifiers
,
unodes
,
uleaves
,
usubsets
;
ocl
::
Kernel
cascadeKernel
;
ocl
::
Kernel
cascadeKernel
;
bool
tryOpenCL
;
bool
tryOpenCL
;
...
@@ -327,19 +327,19 @@ inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step
...
@@ -327,19 +327,19 @@ inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step
weight
[
0
]
=
_f
.
rect
[
0
].
weight
;
weight
[
0
]
=
_f
.
rect
[
0
].
weight
;
weight
[
1
]
=
_f
.
rect
[
1
].
weight
;
weight
[
1
]
=
_f
.
rect
[
1
].
weight
;
weight
[
2
]
=
_f
.
rect
[
2
].
weight
;
weight
[
2
]
=
_f
.
rect
[
2
].
weight
;
Rect
r2
=
weight
[
2
]
>
0
?
_f
.
rect
[
2
].
r
:
Rect
(
0
,
0
,
0
,
0
);
if
(
_f
.
tilted
)
if
(
_f
.
tilted
)
{
{
CV_TILTED_OFS
(
ofs
[
0
][
0
],
ofs
[
0
][
1
],
ofs
[
0
][
2
],
ofs
[
0
][
3
],
tofs
,
_f
.
rect
[
0
].
r
,
step
);
CV_TILTED_OFS
(
ofs
[
0
][
0
],
ofs
[
0
][
1
],
ofs
[
0
][
2
],
ofs
[
0
][
3
],
tofs
,
_f
.
rect
[
0
].
r
,
step
);
CV_TILTED_OFS
(
ofs
[
1
][
0
],
ofs
[
1
][
1
],
ofs
[
1
][
2
],
ofs
[
1
][
3
],
tofs
,
_f
.
rect
[
1
].
r
,
step
);
CV_TILTED_OFS
(
ofs
[
1
][
0
],
ofs
[
1
][
1
],
ofs
[
1
][
2
],
ofs
[
1
][
3
],
tofs
,
_f
.
rect
[
1
].
r
,
step
);
if
(
weight
[
2
])
CV_TILTED_PTRS
(
ofs
[
2
][
0
],
ofs
[
2
][
1
],
ofs
[
2
][
2
],
ofs
[
2
][
3
],
tofs
,
r2
,
step
);
CV_TILTED_PTRS
(
ofs
[
2
][
0
],
ofs
[
2
][
1
],
ofs
[
2
][
2
],
ofs
[
2
][
3
],
tofs
,
_f
.
rect
[
2
].
r
,
step
);
}
}
else
else
{
{
CV_SUM_OFS
(
ofs
[
0
][
0
],
ofs
[
0
][
1
],
ofs
[
0
][
2
],
ofs
[
0
][
3
],
0
,
_f
.
rect
[
0
].
r
,
step
);
CV_SUM_OFS
(
ofs
[
0
][
0
],
ofs
[
0
][
1
],
ofs
[
0
][
2
],
ofs
[
0
][
3
],
0
,
_f
.
rect
[
0
].
r
,
step
);
CV_SUM_OFS
(
ofs
[
1
][
0
],
ofs
[
1
][
1
],
ofs
[
1
][
2
],
ofs
[
1
][
3
],
0
,
_f
.
rect
[
1
].
r
,
step
);
CV_SUM_OFS
(
ofs
[
1
][
0
],
ofs
[
1
][
1
],
ofs
[
1
][
2
],
ofs
[
1
][
3
],
0
,
_f
.
rect
[
1
].
r
,
step
);
if
(
weight
[
2
])
CV_SUM_OFS
(
ofs
[
2
][
0
],
ofs
[
2
][
1
],
ofs
[
2
][
2
],
ofs
[
2
][
3
],
0
,
r2
,
step
);
CV_SUM_OFS
(
ofs
[
2
][
0
],
ofs
[
2
][
1
],
ofs
[
2
][
2
],
ofs
[
2
][
3
],
0
,
_f
.
rect
[
2
].
r
,
step
);
}
}
}
}
...
...
modules/objdetect/src/opencl/haarobjectdetect.cl
View file @
02fb3f0a
...
@@ -12,6 +12,7 @@
...
@@ -12,6 +12,7 @@
//
Nathan,
liujun@multicorewareinc.com
//
Nathan,
liujun@multicorewareinc.com
//
Peng
Xiao,
pengxiao@outlook.com
//
Peng
Xiao,
pengxiao@outlook.com
//
Erping
Pang,
erping@multicorewareinc.com
//
Erping
Pang,
erping@multicorewareinc.com
//
Vadim
Pisarevsky,
vadim.pisarevsky@itseez.com
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
are
permitted
provided
that
the
following
conditions
are
met:
//
//
...
@@ -38,559 +39,117 @@
...
@@ -38,559 +39,117 @@
//
//
//
//
#
define
CV_HAAR_FEATURE_MAX
3
typedef
struct
__attribute__
((
aligned
(
4
)))
OptFeature
{
#
define
calc_sum
(
rect,offset
)
(
sum[
(
rect
)
.
p0+offset]
-
sum[
(
rect
)
.
p1+offset]
-
sum[
(
rect
)
.
p2+offset]
+
sum[
(
rect
)
.
p3+offset]
)
int4
ofs[3]
__attribute__
((
aligned
(
4
)))
;
#
define
calc_sum1
(
rect,offset,i
)
(
sum[
(
rect
)
.
p0[i]+offset]
-
sum[
(
rect
)
.
p1[i]+offset]
-
sum[
(
rect
)
.
p2[i]+offset]
+
sum[
(
rect
)
.
p3[i]+offset]
)
float4
weight
__attribute__
((
aligned
(
4
)))
;
}
typedef
int
sumtype
;
OptFeature
;
typedef
float
sqsumtype
;
#
ifndef
STUMP_BASED
#
define
STUMP_BASED
1
#
endif
typedef
struct
__attribute__
((
aligned
(
128
)
))
GpuHidHaar
TreeNode
typedef
struct
__attribute__
((
aligned
(
4
)))
D
TreeNode
{
{
int
p[CV_HAAR_FEATURE_MAX][4]
__attribute__
((
aligned
(
64
)))
;
int
featureIdx
__attribute__
((
aligned
(
4
)))
;
float
weight[CV_HAAR_FEATURE_MAX]
;
float
threshold
__attribute__
((
aligned
(
4
)))
; // for ordered features only
float
threshold
;
float
alpha[3]
__attribute__
((
aligned
(
16
)))
;
int
left
__attribute__
((
aligned
(
4
)))
;
int
left
__attribute__
((
aligned
(
4
)))
;
int
right
__attribute__
((
aligned
(
4
)))
;
int
right
__attribute__
((
aligned
(
4
)))
;
}
}
GpuHidHaarTreeNode
;
DTreeNode
;
//typedef
struct
__attribute__
((
aligned
(
32
)))
GpuHidHaarClassifier
typedef
struct
__attribute__
((
aligned
(
4
)))
DTree
//{
//
int
count
__attribute__
((
aligned
(
4
)))
;
//
GpuHidHaarTreeNode*
node
__attribute__
((
aligned
(
8
)))
;
//
float*
alpha
__attribute__
((
aligned
(
8
)))
;
//}
//GpuHidHaarClassifier
;
typedef
struct
__attribute__
((
aligned
(
64
)))
GpuHidHaarStageClassifier
{
{
int
count
__attribute__
((
aligned
(
4
)))
;
int
nodeCount
__attribute__
((
aligned
(
4
)))
;
float
threshold
__attribute__
((
aligned
(
4
)))
;
int
two_rects
__attribute__
((
aligned
(
4
)))
;
int
reserved0
__attribute__
((
aligned
(
8
)))
;
int
reserved1
__attribute__
((
aligned
(
8
)))
;
int
reserved2
__attribute__
((
aligned
(
8
)))
;
int
reserved3
__attribute__
((
aligned
(
8
)))
;
}
}
GpuHidHaarStageClassifier
;
DTree
;
//typedef
struct
__attribute__
((
aligned
(
64
)))
GpuHidHaarClassifierCascade
//{
//
int
count
__attribute__
((
aligned
(
4
)))
;
//
int
is_stump_based
__attribute__
((
aligned
(
4
)))
;
//
int
has_tilted_features
__attribute__
((
aligned
(
4
)))
;
//
int
is_tree
__attribute__
((
aligned
(
4
)))
;
//
int
pq0
__attribute__
((
aligned
(
4
)))
;
//
int
pq1
__attribute__
((
aligned
(
4
)))
;
//
int
pq2
__attribute__
((
aligned
(
4
)))
;
//
int
pq3
__attribute__
((
aligned
(
4
)))
;
//
int
p0
__attribute__
((
aligned
(
4
)))
;
//
int
p1
__attribute__
((
aligned
(
4
)))
;
//
int
p2
__attribute__
((
aligned
(
4
)))
;
//
int
p3
__attribute__
((
aligned
(
4
)))
;
//
float
inv_window_area
__attribute__
((
aligned
(
4
)))
;
//}
GpuHidHaarClassifierCascade
;
#
ifdef
PACKED_CLASSIFIER
//
this
code
is
scalar,
one
pixel
->
one
workitem
__kernel
void
gpuRunHaarClassifierCascadePacked
(
global
const
GpuHidHaarStageClassifier
*
stagecascadeptr,
global
const
int4
*
info,
global
const
GpuHidHaarTreeNode
*
nodeptr,
global
const
int
*
restrict
sum,
global
const
float
*
restrict
sqsum,
volatile
global
int4
*
candidate,
const
int
pixelstep,
const
int
loopcount,
const
int
start_stage,
const
int
split_stage,
const
int
end_stage,
const
int
startnode,
const
int
splitnode,
const
int4
p,
const
int4
pq,
const
float
correction,
global
const
int*
pNodesPK,
global
const
int4*
pWGInfo
)
typedef
struct
__attribute__
((
aligned
(
4
)))
Stage
{
{
//
this
version
used
information
provided
for
each
workgroup
int
first
__attribute__
((
aligned
(
4
)))
;
//
no
empty
WG
int
ntrees
__attribute__
((
aligned
(
4
)))
;
int
gid
=
(
int
)
get_group_id
(
0
)
;
float
threshold
__attribute__
((
aligned
(
4
)))
;
int
lid_x
=
(
int
)
get_local_id
(
0
)
;
}
int
lid_y
=
(
int
)
get_local_id
(
1
)
;
Stage
;
int
lid
=
lid_y*LSx+lid_x
;
int4
WGInfo
=
pWGInfo[gid]
;
__kernel
void
runHaarClassifierStump
(
int
GroupX
=
(
WGInfo.y
>>
16
)
&0xFFFF
;
__global
const
int*
sum,
int
GroupY
=
(
WGInfo.y
>>
0
)
&
0xFFFF
;
int
sumstep,
int
sumoffset,
int
Width
=
(
WGInfo.x
>>
16
)
&0xFFFF
;
__global
const
int*
sqsum,
int
Height
=
(
WGInfo.x
>>
0
)
&
0xFFFF
;
int
sqsumstep,
int
sqsumoffset,
int
ImgOffset
=
WGInfo.z
;
__global
const
OptFeature*
optfeatures,
float
ScaleFactor
=
as_float
(
WGInfo.w
)
;
int
nstages,
#
define
DATA_SIZE_X
(
LSx+WND_SIZE_X
)
__global
const
Stage*
stages,
#
define
DATA_SIZE_Y
(
LSy+WND_SIZE_Y
)
__global
const
DTree*
trees,
#
define
DATA_SIZE
(
DATA_SIZE_X*DATA_SIZE_Y
)
__global
const
DTreeNode*
nodes,
__global
const
float*
leaves,
local
int
SumL[DATA_SIZE]
;
volatile
__global
int*
facepos,
//
read
input
data
window
into
local
mem
int2
imgsize,
int
xyscale,
float
factor,
for
(
int
i
=
0
; i<DATA_SIZE; i+=(LSx*LSy))
int4
normrect,
int2
windowsize
)
{
int
index
=
i+lid
; // index in shared local memory
if
(
index<DATA_SIZE
)
{//
calc
global
x,y
coordinat
and
read
data
from
there
int
x
=
min
(
GroupX
+
(
index
%
(
DATA_SIZE_X
))
,
Width-1
)
;
int
y
=
min
(
GroupY
+
(
index
/
(
DATA_SIZE_X
))
,
Height-1
)
;
SumL[index]
=
sum[ImgOffset+y*pixelstep+x]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
calc
variance_norm_factor
for
all
stages
float
variance_norm_factor
;
int
nodecounter=
startnode
;
int4
info1
=
p
;
int4
info2
=
pq
;
{
int
xl
=
lid_x
;
int
yl
=
lid_y
;
int
OffsetLocal
=
yl
*
DATA_SIZE_X
+
xl
;
int
OffsetGlobal
=
(
GroupY+yl
)
*
pixelstep
+
(
GroupX+xl
)
;
//
add
shift
to
get
position
on
scaled
image
OffsetGlobal
+=
ImgOffset
;
float
mean
=
SumL[info1.y*DATA_SIZE_X+info1.x+OffsetLocal]
-
SumL[info1.y*DATA_SIZE_X+info1.z+OffsetLocal]
-
SumL[info1.w*DATA_SIZE_X+info1.x+OffsetLocal]
+
SumL[info1.w*DATA_SIZE_X+info1.z+OffsetLocal]
;
float
sq
=
sqsum[info2.y*pixelstep+info2.x+OffsetGlobal]
-
sqsum[info2.y*pixelstep+info2.z+OffsetGlobal]
-
sqsum[info2.w*pixelstep+info2.x+OffsetGlobal]
+
sqsum[info2.w*pixelstep+info2.z+OffsetGlobal]
;
mean
*=
correction
;
sq
*=
correction
;
variance_norm_factor
=
sq
-
mean
*
mean
;
variance_norm_factor
=
(
variance_norm_factor
>=0.f
)
?
sqrt
(
variance_norm_factor
)
:
1.f
;
}//
end
calc
variance_norm_factor
for
all
stages
int
result
=
(
1.0f>0.0f
)
;
for
(
int
stageloop
=
start_stage
; (stageloop < end_stage) && result; stageloop++ )
{//
iterate
until
candidate
is
exist
float
stage_sum
=
0.0f
;
__global
GpuHidHaarStageClassifier*
stageinfo
=
(
__global
GpuHidHaarStageClassifier*
)
((
__global
uchar*
)
stagecascadeptr+stageloop*sizeof
(
GpuHidHaarStageClassifier
))
;
int
stagecount
=
stageinfo->count
;
float
stagethreshold
=
stageinfo->threshold
;
int
lcl_off
=
(
lid_y*DATA_SIZE_X
)
+
(
lid_x
)
;
for
(
int
nodeloop
=
0
; nodeloop < stagecount; nodecounter++,nodeloop++ )
{
//
simple
macro
to
extract
shorts
from
int
#
define
M0
(
_t
)
((
_t
)
&0xFFFF
)
#
define
M1
(
_t
)
(((
_t
)
>>16
)
&0xFFFF
)
//
load
packed
node
data
from
global
memory
(
L3
)
into
registers
global
const
int4*
pN
=
(
__global
int4*
)(
pNodesPK+nodecounter*NODE_SIZE
)
;
int4
n0
=
pN[0]
;
int4
n1
=
pN[1]
;
int4
n2
=
pN[2]
;
float
nodethreshold
=
as_float
(
n2.y
)
*
variance_norm_factor
;
//
calc
sum
of
intensity
pixels
according
to
node
information
float
classsum
=
(
SumL[M0
(
n0.x
)
+lcl_off]
-
SumL[M1
(
n0.x
)
+lcl_off]
-
SumL[M0
(
n0.y
)
+lcl_off]
+
SumL[M1
(
n0.y
)
+lcl_off]
)
*
as_float
(
n1.z
)
+
(
SumL[M0
(
n0.z
)
+lcl_off]
-
SumL[M1
(
n0.z
)
+lcl_off]
-
SumL[M0
(
n0.w
)
+lcl_off]
+
SumL[M1
(
n0.w
)
+lcl_off]
)
*
as_float
(
n1.w
)
+
(
SumL[M0
(
n1.x
)
+lcl_off]
-
SumL[M1
(
n1.x
)
+lcl_off]
-
SumL[M0
(
n1.y
)
+lcl_off]
+
SumL[M1
(
n1.y
)
+lcl_off]
)
*
as_float
(
n2.x
)
;
//accumulate
stage
responce
stage_sum
+=
(
classsum
>=
nodethreshold
)
?
as_float
(
n2.w
)
:
as_float
(
n2.z
)
;
}
result
=
(
stage_sum
>=
stagethreshold
)
;
}//
next
stage
if
needed
if
(
result
)
{//
all
stages
will
be
passed
and
there
is
a
detected
face
on
the
tested
position
int
index
=
1+atomic_inc
((
volatile
global
int*
)
candidate
)
; //get index to write global data with face info
if
(
index<OUTPUTSZ
)
{
int
x
=
GroupX+lid_x
;
int
y
=
GroupY+lid_y
;
int4
candidate_result
;
candidate_result.x
=
convert_int_rtn
(
x*ScaleFactor
)
;
candidate_result.y
=
convert_int_rtn
(
y*ScaleFactor
)
;
candidate_result.z
=
convert_int_rtn
(
ScaleFactor*WND_SIZE_X
)
;
candidate_result.w
=
convert_int_rtn
(
ScaleFactor*WND_SIZE_Y
)
;
candidate[index]
=
candidate_result
;
}
}
}//end
gpuRunHaarClassifierCascade
#
else
__kernel
void
__attribute__
((
reqd_work_group_size
(
8
,
8
,
1
)))
gpuRunHaarClassifierCascade
(
global
GpuHidHaarStageClassifier
*
stagecascadeptr,
global
int4
*
info,
global
GpuHidHaarTreeNode
*
nodeptr,
global
const
int
*
restrict
sum1,
global
const
float
*
restrict
sqsum1,
global
int4
*
candidate,
const
int
pixelstep,
const
int
loopcount,
const
int
start_stage,
const
int
split_stage,
const
int
end_stage,
const
int
startnode,
const
int
splitnode,
const
int4
p,
const
int4
pq,
const
float
correction
)
{
{
int
grpszx
=
get_local_size
(
0
)
;
int
ix
=
get_global_id
(
0
)
*xyscale
;
int
grpszy
=
get_local_size
(
1
)
;
int
iy
=
get_global_id
(
1
)
*xyscale
;
int
grpnumx
=
get_num_groups
(
0
)
;
sumstep
/=
sizeof
(
int
)
;
int
grpidx
=
get_group_id
(
0
)
;
sqsumstep
/=
sizeof
(
int
)
;
int
lclidx
=
get_local_id
(
0
)
;
int
lclidy
=
get_local_id
(
1
)
;
if
(
ix
<
imgsize.x
&&
iy
<
imgsize.y
)
int
lcl_sz
=
mul24
(
grpszx,grpszy
)
;
int
lcl_id
=
mad24
(
lclidy,grpszx,lclidx
)
;
__local
int
lclshare[1024]
;
__local
int*
lcldata
=
lclshare
;//for save win data
__local
int*
glboutindex
=
lcldata
+
28*28
;//for save global out index
__local
int*
lclcount
=
glboutindex
+
1
;//for save the numuber of temp pass pixel
__local
int*
lcloutindex
=
lclcount
+
1
;//for save info of temp pass pixel
__local
float*
partialsum
=
(
__local
float*
)(
lcloutindex
+
(
lcl_sz<<1
))
;
glboutindex[0]=0
;
int
outputoff
=
mul24
(
grpidx,256
)
;
//assume
window
size
is
20X20
#
define
WINDOWSIZE
20+1
//make
sure
readwidth
is
the
multiple
of
4
//ystep
=1,
from
host
code
int
readwidth
=
((
grpszx-1
+
WINDOWSIZE+3
)
>>2
)
<<2
;
int
readheight
=
grpszy-1+WINDOWSIZE
;
int
read_horiz_cnt
=
readwidth
>>
2
;//each read int4
int
total_read
=
mul24
(
read_horiz_cnt,readheight
)
;
int
read_loop
=
(
total_read
+
lcl_sz
-
1
)
>>
6
;
candidate[outputoff+
(
lcl_id<<2
)
]
=
(
int4
)
0
;
candidate[outputoff+
(
lcl_id<<2
)
+1]
=
(
int4
)
0
;
candidate[outputoff+
(
lcl_id<<2
)
+2]
=
(
int4
)
0
;
candidate[outputoff+
(
lcl_id<<2
)
+3]
=
(
int4
)
0
;
for
(
int
scalei
=
0
; scalei <loopcount; scalei++)
{
{
int4
scaleinfo1=
info[scalei]
;
int
ntrees,
nodeOfs
=
0
,
leafOfs
=
0
;
int
height
=
scaleinfo1.x
&
0xffff
;
int
stageIdx,
i
;
int
grpnumperline
=
(
scaleinfo1.y
&
0xffff0000
)
>>
16
;
float
s
=
0.f
;
int
totalgrp
=
scaleinfo1.y
&
0xffff
;
__global
const
DTreeNode*
node
;
int
imgoff
=
scaleinfo1.z
;
__global
const
OptFeature*
f
;
float
factor
=
as_float
(
scaleinfo1.w
)
;
__global
const
int*
psum
=
sum
+
mad24
(
iy,
sumstep,
ix
)
;
__global
const
int
*
sum
=
sum1
+
imgoff
;
__global
const
int*
pnsum
=
psum
+
mad24
(
normrect.y,
sumstep,
normrect.x
)
;
__global
const
float
*
sqsum
=
sqsum1
+
imgoff
;
int
normarea
=
normrect.z
*
normrect.w
;
for
(
int
grploop=grpidx
; grploop<totalgrp; grploop+=grpnumx)
float
invarea
=
1.f/normarea
;
float
sval
=
(
pnsum[0]
-
pnsum[normrect.z]
-
pnsum[mul24
(
normrect.w,
sumstep
)
]
+
pnsum[mad24
(
normrect.w,
sumstep,
normrect.z
)
]
)
*invarea
;
float
sqval
=
(
sqsum[mad24
(
iy
+
normrect.y,
sqsumstep,
ix
+
normrect.x
)
]
)
*invarea
;
float
nf
=
(
float
)
normarea
*
sqrt
(
max
(
sqval
-
sval
*
sval,
0.f
))
;
float4
weight
;
int4
ofs
;
nf
=
nf
>
0
?
nf
:
1.f
;
for
(
stageIdx
=
0
; stageIdx < nstages; stageIdx++ )
{
{
int
grpidy
=
grploop
/
grpnumperline
;
ntrees
=
stages[stageIdx].ntrees
;
int
grpidx
=
grploop
-
mul24
(
grpidy,
grpnumperline
)
;
s
=
0.f
;
int
x
=
mad24
(
grpidx,grpszx,lclidx
)
;
for
(
i
=
0
; i < ntrees; i++, nodeOfs++, leafOfs += 2 )
int
y
=
mad24
(
grpidy,grpszy,lclidy
)
;
int
grpoffx
=
x-lclidx
;
int
grpoffy
=
y-lclidy
;
for
(
int
i=0
; i<read_loop; i++)
{
{
int
pos_id
=
mad24
(
i,lcl_sz,lcl_id
)
;
node
=
nodes
+
nodeOfs
;
pos_id
=
pos_id
<
total_read
?
pos_id
:
0
;
f
=
optfeatures
+
node->featureIdx
;
int
lcl_y
=
pos_id
/
read_horiz_cnt
;
weight
=
f->weight
;
int
lcl_x
=
pos_id
-
mul24
(
lcl_y,
read_horiz_cnt
)
;
ofs
=
f->ofs[0]
;
int
glb_x
=
grpoffx
+
(
lcl_x<<2
)
;
sval
=
(
psum[ofs.x]
-
psum[ofs.y]
-
psum[ofs.z]
+
psum[ofs.w]
)
*weight.x
;
int
glb_y
=
grpoffy
+
lcl_y
;
ofs
=
f->ofs[1]
;
sval
+=
(
psum[ofs.x]
-
psum[ofs.y]
-
psum[ofs.z]
+
psum[ofs.w]
)
*weight.y
;
int
glb_off
=
mad24
(
min
(
glb_y,
height
+
WINDOWSIZE
-
1
)
,
pixelstep,glb_x
)
;
if
(
weight.z
>
0
)
int4
data
=
*
(
__global
int4*
)
&sum[glb_off]
;
int
lcl_off
=
mad24
(
lcl_y,
readwidth,
lcl_x<<2
)
;
vstore4
(
data,
0
,
&lcldata[lcl_off]
)
;
}
lcloutindex[lcl_id]
=
0
;
lclcount[0]
=
0
;
int
result
=
1
;
int
nodecounter=
startnode
;
float
mean,
variance_norm_factor
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
lcl_off
=
mad24
(
lclidy,readwidth,lclidx
)
;
int4
cascadeinfo1,
cascadeinfo2
;
cascadeinfo1
=
p
;
cascadeinfo2
=
pq
;
cascadeinfo1.x
+=lcl_off
;
cascadeinfo1.z
+=lcl_off
;
mean
=
(
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.x
)
]
-
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.z
)
]
-
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.x
)
]
+
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.z
)
]
)
*correction
;
int
p_offset
=
mad24
(
y,
pixelstep,
x
)
;
cascadeinfo2.x
+=p_offset
;
cascadeinfo2.z
+=p_offset
;
variance_norm_factor
=sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.x
)
]
-
sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.z
)
]
-
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.x
)
]
+
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.z
)
]
;
variance_norm_factor
=
variance_norm_factor
*
correction
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
for
(
int
stageloop
=
start_stage
; (stageloop < split_stage) && result; stageloop++ )
{
float
stage_sum
=
0.f
;
__global
GpuHidHaarStageClassifier*
stageinfo
=
(
__global
GpuHidHaarStageClassifier*
)
((
__global
uchar*
)
stagecascadeptr+stageloop*sizeof
(
GpuHidHaarStageClassifier
))
;
int
stagecount
=
stageinfo->count
;
float
stagethreshold
=
stageinfo->threshold
;
for
(
int
nodeloop
=
0
; nodeloop < stagecount; )
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
__global
GpuHidHaarTreeNode*
)
(((
__global
uchar*
)
nodeptr
)
+
nodecounter
*
sizeof
(
GpuHidHaarTreeNode
))
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float3
alpha3
=
*
(
__global
float3*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=lcl_off
;
info1.z
+=lcl_off
;
info2.x
+=lcl_off
;
info2.z
+=lcl_off
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
info3.x
+=lcl_off
;
info3.z
+=lcl_off
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
bool
passThres
=
classsum
>=
nodethreshold
;
#
if
STUMP_BASED
stage_sum
+=
passThres
?
alpha3.y
:
alpha3.x
;
nodecounter++
;
nodeloop++
;
#
else
bool
isRootNode
=
(
nodecounter
&
1
)
==
0
;
if
(
isRootNode
)
{
if
(
(
passThres
&&
currentnodeptr->right
)
|
|
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += passThres ? alpha3.z : alpha3.y;
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (stage_sum >= stagethreshold) ? 1 : 0;
}
if(factor < 2)
{
if(result && lclidx %2 ==0 && lclidy %2 ==0 )
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) |
lclidx
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
((
float
)
variance_norm_factor
)
;
}
}
else
{
if
(
result
)
{
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
ofs
=
f->ofs[2]
;
lcloutindex[queueindex<<1]
=
(
lclidy
<<
16
)
| lclidx;
sval
+=
(
psum[ofs.x]
-
psum[ofs.y]
-
psum[ofs.z]
+
psum[ofs.w]
)
*weight.z
;
lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
}
}
s
+=
leaves[
sval
<
node->threshold*nf
?
leafOfs
:
leafOfs
+
1
]
;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
if
(
s
<
stages[stageIdx].threshold
)
barrier(CLK_LOCAL_MEM_FENCE);
break
;
nodecounter = splitnode;
}
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
{
if
(
stageIdx
==
nstages
)
lclcount[0]=0;
{
barrier(CLK_LOCAL_MEM_FENCE);
int
nfaces
=
atomic_inc
(
facepos
)
;
//printf
(
"detected face #d!!!!\n"
,
nfaces
)
;
//int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
if
(
nfaces
<
MAX_FACES
)
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
int stagecount = stageinfo->count;
float stagethreshold = stageinfo->threshold;
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stagecount + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
if(lcl_compute_win_id < queuecount)
{
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
const int stump_factor = STUMP_BASED ? 1 : 2;
int root_offset = 0;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stagecount;)
{
__global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
(((__global uchar*)nodeptr) + sizeof(GpuHidHaarTreeNode) * ((nodecounter + tempnodecounter) * stump_factor + root_offset));
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
part_sum += passThres ? alpha3.y : alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
#else
if(root_offset == 0)
{
if( (passThres && currentnodeptr->right) |
|
(
!passThres
&&
currentnodeptr->left
))
{
root_offset
=
1
;
}
else
{
part_sum
+=
alpha3.x
;
tempnodecounter
+=
lcl_compute_win
;
lcl_loop++
;
}
}
else
{
part_sum
+=
passThres
?
alpha3.z
:
alpha3.y
;
tempnodecounter
+=
lcl_compute_win
;
lcl_loop++
;
root_offset
=
0
;
}
#
endif
}//end
for
(
int
lcl_loop=0
;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
for
(
int
i=0
; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum
+=
partialsum[lcl_id+i]
;
}
if
(
stage_sum
>=
stagethreshold
&&
(
lcl_compute_id==0
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
temp_coord
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
}
lcl_compute_win_id
+=
(
1<<perfscale
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}//end
for
(
int
queueloop=0
;queueloop<queuecount_loop;queueloop++)
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
+=
stagecount
;
}//end
for
(
int
stageloop
=
splitstage
; stageloop< endstage && queuecount>0;stageloop++)
if
(
lcl_id<queuecount
)
{
{
int
temp
=
lcloutindex[lcl_id<<1]
;
volatile
__global
int*
face
=
facepos
+
1
+
nfaces*4
;
int
x
=
mad24
(
grpidx,grpszx,temp
&
0xffff
)
;
face[0]
=
convert_int_rte
(
ix*factor
)
;
int
y
=
mad24
(
grpidy,grpszy,
((
temp
&
(
int
)
0xffff0000
)
>>
16
))
;
face[1]
=
convert_int_rte
(
iy*factor
)
;
temp
=
glboutindex[0]
;
face[2]
=
convert_int_rte
(
windowsize.x*factor
)
;
int4
candidate_result
;
face[3]
=
convert_int_rte
(
windowsize.y*factor
)
;
candidate_result.zw
=
(
int2
)
convert_int_rte
(
factor*20.f
)
;
candidate_result.x
=
convert_int_rte
(
x*factor
)
;
candidate_result.y
=
convert_int_rte
(
y*factor
)
;
atomic_inc
(
glboutindex
)
;
int
i
=
outputoff+temp+lcl_id
;
if
(
candidate[i].z
==
0
)
{
candidate[i]
=
candidate_result
;
}
else
{
for
(
i=i+1
;;i++)
{
if
(
candidate[i].z
==
0
)
{
candidate[i]
=
candidate_result
;
break
;
}
}
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}//end
for
(
int
grploop=grpidx
;grploop<totalgrp;grploop+=grpnumx)
}
}//end
for
(
int
scalei
=
0
; scalei <loopcount; scalei++)
}
}
#
endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment