Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
95967461
Commit
95967461
authored
Sep 04, 2012
by
Vadim Pisarevsky
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
restored SSE2 and added AVX optimization of the old haar face detector
parent
dea52eb7
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
427 additions
and
103 deletions
+427
-103
system.cpp
modules/core/src/system.cpp
+1
-1
haar.cpp
modules/objdetect/src/haar.cpp
+426
-102
No files found.
modules/core/src/system.cpp
View file @
95967461
...
...
@@ -178,7 +178,7 @@ struct HWFeatures
f
.
have
[
CV_CPU_SSE4_1
]
=
(
cpuid_data
[
2
]
&
(
1
<<
19
))
!=
0
;
f
.
have
[
CV_CPU_SSE4_2
]
=
(
cpuid_data
[
2
]
&
(
1
<<
20
))
!=
0
;
f
.
have
[
CV_CPU_POPCNT
]
=
(
cpuid_data
[
2
]
&
(
1
<<
23
))
!=
0
;
f
.
have
[
CV_CPU_AVX
]
=
(
cpuid_data
[
2
]
&
(
1
<<
28
))
!=
0
;
f
.
have
[
CV_CPU_AVX
]
=
(
((
cpuid_data
[
2
]
&
(
1
<<
28
))
!=
0
)
&&
((
cpuid_data
[
2
]
&
(
1
<<
27
))
!=
0
));
//OS uses XSAVE_XRSTORE and CPU support AVX
}
return
f
;
...
...
modules/objdetect/src/haar.cpp
View file @
95967461
...
...
@@ -43,26 +43,23 @@
#include "precomp.hpp"
#include <stdio.h>
/*
#if CV_SSE2
#if CV_SSE2 || CV_SSE3
# if !CV_SSE4_1 && !CV_SSE4_2
# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
# endif
#endif
#if defined CV_ICC
# if defined CV_AVX
# if CV_AVX
# define CV_HAAR_USE_AVX 1
# else
# if
defined CV_SSE2 || defined CV_SSE4_1 || defined CV_SSE4_2
# if
CV_SSE2 || CV_SSE3
# define CV_HAAR_USE_SSE 1
# else
# define CV_HAAR_NO_SIMD 1
# endif
# endif
#endif
*/
/* these settings affect the quality of detection: change with care */
#define CV_ADJUST_FEATURES 1
#define CV_ADJUST_WEIGHTS 0
...
...
@@ -636,34 +633,163 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
}
//AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
#ifdef CV_HAAR_USE_AVX
CV_INLINE
double
icvEvalHidHaarClassifierAVX
(
CvHidHaarClassifier
*
classifier
,
double
variance_norm_factor
,
size_t
p_offset
)
{
int
CV_DECL_ALIGNED
(
32
)
idxV
[
8
]
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
char
flags
[
8
]
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
CvHidHaarTreeNode
*
nodes
[
8
];
double
res
=
0
;
char
exitConditionFlag
=
0
;
for
(;;)
{
float
CV_DECL_ALIGNED
(
32
)
tmp
[
8
]
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
nodes
[
0
]
=
classifier
->
node
+
idxV
[
0
];
nodes
[
1
]
=
(
classifier
+
1
)
->
node
+
idxV
[
1
];
nodes
[
2
]
=
(
classifier
+
2
)
->
node
+
idxV
[
2
];
nodes
[
3
]
=
(
classifier
+
3
)
->
node
+
idxV
[
3
];
nodes
[
4
]
=
(
classifier
+
4
)
->
node
+
idxV
[
4
];
nodes
[
5
]
=
(
classifier
+
5
)
->
node
+
idxV
[
5
];
nodes
[
6
]
=
(
classifier
+
6
)
->
node
+
idxV
[
6
];
nodes
[
7
]
=
(
classifier
+
7
)
->
node
+
idxV
[
7
];
__m256
t
=
_mm256_set1_ps
(
variance_norm_factor
);
t
=
_mm256_mul_ps
(
t
,
_mm256_set_ps
(
nodes
[
7
]
->
threshold
,
nodes
[
6
]
->
threshold
,
nodes
[
5
]
->
threshold
,
nodes
[
4
]
->
threshold
,
nodes
[
3
]
->
threshold
,
nodes
[
2
]
->
threshold
,
nodes
[
1
]
->
threshold
,
nodes
[
0
]
->
threshold
));
__m256
offset
=
_mm256_set_ps
(
calc_sum
(
nodes
[
7
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
6
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
5
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
4
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
3
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
2
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
1
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
0
]
->
feature
.
rect
[
0
],
p_offset
));
__m256
weight
=
_mm256_set_ps
(
nodes
[
7
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
6
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
5
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
4
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
3
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
2
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
1
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
0
]
->
feature
.
rect
[
0
].
weight
);
__m256
sum
=
_mm256_mul_ps
(
offset
,
weight
);
offset
=
_mm256_set_ps
(
calc_sum
(
nodes
[
7
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
6
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
5
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
4
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
3
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
2
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
1
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
0
]
->
feature
.
rect
[
1
],
p_offset
));
weight
=
_mm256_set_ps
(
nodes
[
7
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
6
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
5
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
4
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
3
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
2
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
1
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
0
]
->
feature
.
rect
[
1
].
weight
);
sum
=
_mm256_add_ps
(
sum
,
_mm256_mul_ps
(
offset
,
weight
));
if
(
nodes
[
0
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
0
]
=
calc_sum
(
nodes
[
0
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
0
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
1
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
1
]
=
calc_sum
(
nodes
[
1
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
1
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
2
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
2
]
=
calc_sum
(
nodes
[
2
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
2
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
3
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
3
]
=
calc_sum
(
nodes
[
3
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
3
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
4
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
4
]
=
calc_sum
(
nodes
[
4
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
4
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
5
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
5
]
=
calc_sum
(
nodes
[
5
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
5
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
6
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
6
]
=
calc_sum
(
nodes
[
6
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
6
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
7
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
7
]
=
calc_sum
(
nodes
[
7
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
7
]
->
feature
.
rect
[
2
].
weight
;
sum
=
_mm256_add_ps
(
sum
,
_mm256_load_ps
(
tmp
));
__m256
left
=
_mm256_set_ps
(
nodes
[
7
]
->
left
,
nodes
[
6
]
->
left
,
nodes
[
5
]
->
left
,
nodes
[
4
]
->
left
,
nodes
[
3
]
->
left
,
nodes
[
2
]
->
left
,
nodes
[
1
]
->
left
,
nodes
[
0
]
->
left
);
__m256
right
=
_mm256_set_ps
(
nodes
[
7
]
->
right
,
nodes
[
6
]
->
right
,
nodes
[
5
]
->
right
,
nodes
[
4
]
->
right
,
nodes
[
3
]
->
right
,
nodes
[
2
]
->
right
,
nodes
[
1
]
->
right
,
nodes
[
0
]
->
right
);
_mm256_store_si256
((
__m256i
*
)
idxV
,
_mm256_cvttps_epi32
(
_mm256_blendv_ps
(
right
,
left
,
_mm256_cmp_ps
(
sum
,
t
,
_CMP_LT_OQ
))));
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
if
(
idxV
[
i
]
<=
0
)
{
if
(
!
flags
[
i
])
{
exitConditionFlag
++
;
flags
[
i
]
=
1
;
res
+=
((
classifier
+
i
)
->
alpha
[
-
idxV
[
i
]]);
}
idxV
[
i
]
=
0
;
}
}
if
(
exitConditionFlag
==
8
)
return
res
;
}
}
#endif
CV_INLINE
double
icvEvalHidHaarClassifier
(
CvHidHaarClassifier
*
classifier
,
double
variance_norm_factor
,
size_t
p_offset
)
{
int
idx
=
0
;
do
/*#if CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX
if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow
{
double CV_DECL_ALIGNED(16) temp[2];
__m128d zero = _mm_setzero_pd();
do
{
CvHidHaarTreeNode* node = classifier->node + idx;
__m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor);
__m128d left = _mm_set1_pd(node->left);
__m128d right = _mm_set1_pd(node->right);
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
if( node->feature.rect[2].p0 )
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
__m128d sum = _mm_set1_pd(_sum);
t = _mm_cmplt_sd(sum, t);
sum = _mm_blendv_pd(right, left, t);
_mm_store_pd(temp, sum);
idx = (int)temp[0];
}
while(idx > 0 );
}
else
#endif*/
{
CvHidHaarTreeNode
*
node
=
classifier
->
node
+
idx
;
double
t
=
node
->
threshold
*
variance_norm_factor
;
do
{
CvHidHaarTreeNode
*
node
=
classifier
->
node
+
idx
;
double
t
=
node
->
threshold
*
variance_norm_factor
;
double
sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
double
sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
if
(
node
->
feature
.
rect
[
2
].
p0
)
sum
+=
calc_sum
(
node
->
feature
.
rect
[
2
],
p_offset
)
*
node
->
feature
.
rect
[
2
].
weight
;
if
(
node
->
feature
.
rect
[
2
].
p0
)
sum
+=
calc_sum
(
node
->
feature
.
rect
[
2
],
p_offset
)
*
node
->
feature
.
rect
[
2
].
weight
;
idx
=
sum
<
t
?
node
->
left
:
node
->
right
;
idx
=
sum
<
t
?
node
->
left
:
node
->
right
;
}
while
(
idx
>
0
);
}
while
(
idx
>
0
);
return
classifier
->
alpha
[
-
idx
];
}
static
int
cvRunHaarClassifierCascadeSum
(
const
CvHaarClassifierCascade
*
_cascade
,
CvPoint
pt
,
double
&
stage_sum
,
int
start_stage
)
{
#ifdef CV_HAAR_USE_AVX
bool
haveAVX
=
false
;
if
(
cv
::
checkHardwareSupport
(
CV_CPU_AVX
))
if
(
_xgetbv
(
_XCR_XFEATURE_ENABLED_MASK
)
&
0x6
)
// Check if the OS will save the YMM registers
{
haveAVX
=
true
;
}
#else
#ifdef CV_HAAR_USE_SSE
bool
haveSSE2
=
cv
::
checkHardwareSupport
(
CV_CPU_SSE2
);
#endif
#endif
int
p_offset
,
pq_offset
;
int
i
,
j
;
double
mean
,
variance_norm_factor
;
...
...
@@ -702,10 +828,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
{
stage_sum
=
0.0
;
#ifdef CV_HAAR_USE_AVX
if
(
haveAVX
)
{
for
(
;
j
<
cascade
->
stage_classifier
[
i
].
count
-
8
;
j
+=
8
)
{
stage_sum
+=
icvEvalHidHaarClassifierAVX
(
cascade
->
stage_classifier
[
i
].
classifier
+
j
,
variance_norm_factor
,
p_offset
);
}
}
#endif
for
(
j
=
0
;
j
<
ptr
->
count
;
j
++
)
{
stage_sum
+=
icvEvalHidHaarClassifier
(
ptr
->
classifier
+
j
,
variance_norm_factor
,
p_offset
);
stage_sum
+=
icvEvalHidHaarClassifier
(
ptr
->
classifier
+
j
,
variance_norm_factor
,
p_offset
);
}
if
(
stage_sum
>=
ptr
->
threshold
)
...
...
@@ -723,99 +859,287 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
}
else
if
(
cascade
->
isStumpBased
)
{
for
(
i
=
start_stage
;
i
<
cascade
->
count
;
i
++
)
{
#ifndef CV_HAAR_USE_SSE
stage_sum
=
0.0
;
#else
__m128d
stage_sum
=
_mm_setzero_pd
();
#endif
if
(
cascade
->
stage_classifier
[
i
].
two_rects
)
{
for
(
j
=
0
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
CvHidHaarClassifier
*
classifier
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
CvHidHaarTreeNode
*
node
=
classifier
->
node
;
#ifndef CV_HAAR_USE_SSE
double
t
=
node
->
threshold
*
variance_norm_factor
;
double
sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
stage_sum
+=
classifier
->
alpha
[
sum
>=
t
];
#else
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d
t
=
_mm_set_sd
(
node
->
threshold
*
variance_norm_factor
);
__m128d
a
=
_mm_set_sd
(
classifier
->
alpha
[
0
]);
__m128d
b
=
_mm_set_sd
(
classifier
->
alpha
[
1
]);
__m128d
sum
=
_mm_set_sd
(
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
+
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
);
t
=
_mm_cmpgt_sd
(
t
,
sum
);
stage_sum
=
_mm_add_sd
(
stage_sum
,
_mm_blendv_pd
(
b
,
a
,
t
));
#endif
}
}
else
{
for
(
j
=
0
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
CvHidHaarClassifier
*
classifier
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
CvHidHaarTreeNode
*
node
=
classifier
->
node
;
#ifndef CV_HAAR_USE_SSE
double
t
=
node
->
threshold
*
variance_norm_factor
;
double
sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
if
(
node
->
feature
.
rect
[
2
].
p0
)
sum
+=
calc_sum
(
node
->
feature
.
rect
[
2
],
p_offset
)
*
node
->
feature
.
rect
[
2
].
weight
;
stage_sum
+=
classifier
->
alpha
[
sum
>=
t
];
#else
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d
t
=
_mm_set_sd
(
node
->
threshold
*
variance_norm_factor
);
__m128d
a
=
_mm_set_sd
(
classifier
->
alpha
[
0
]);
__m128d
b
=
_mm_set_sd
(
classifier
->
alpha
[
1
]);
double
_sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
_sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
if
(
node
->
feature
.
rect
[
2
].
p0
)
_sum
+=
calc_sum
(
node
->
feature
.
rect
[
2
],
p_offset
)
*
node
->
feature
.
rect
[
2
].
weight
;
__m128d
sum
=
_mm_set_sd
(
_sum
);
t
=
_mm_cmpgt_sd
(
t
,
sum
);
stage_sum
=
_mm_add_sd
(
stage_sum
,
_mm_blendv_pd
(
b
,
a
,
t
));
#endif
}
}
#ifndef CV_HAAR_USE_SSE
if
(
stage_sum
<
cascade
->
stage_classifier
[
i
].
threshold
)
#else
__m128d
i_threshold
=
_mm_set_sd
(
cascade
->
stage_classifier
[
i
].
threshold
);
if
(
_mm_comilt_sd
(
stage_sum
,
i_threshold
)
)
#endif
return
-
i
;
}
}
else
#ifdef CV_HAAR_USE_AVX
if
(
haveAVX
)
{
CvHidHaarClassifier
*
classifiers
[
8
];
CvHidHaarTreeNode
*
nodes
[
8
];
for
(
i
=
start_stage
;
i
<
cascade
->
count
;
i
++
)
{
stage_sum
=
0.0
;
int
j
=
0
;
float
CV_DECL_ALIGNED
(
32
)
buf
[
8
];
if
(
cascade
->
stage_classifier
[
i
].
two_rects
)
{
for
(
;
j
<=
cascade
->
stage_classifier
[
i
].
count
-
8
;
j
+=
8
)
{
//__m256 stage_sumPart = _mm256_setzero_ps();
classifiers
[
0
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
nodes
[
0
]
=
classifiers
[
0
]
->
node
;
classifiers
[
1
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
1
;
nodes
[
1
]
=
classifiers
[
1
]
->
node
;
classifiers
[
2
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
2
;
nodes
[
2
]
=
classifiers
[
2
]
->
node
;
classifiers
[
3
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
3
;
nodes
[
3
]
=
classifiers
[
3
]
->
node
;
classifiers
[
4
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
4
;
nodes
[
4
]
=
classifiers
[
4
]
->
node
;
classifiers
[
5
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
5
;
nodes
[
5
]
=
classifiers
[
5
]
->
node
;
classifiers
[
6
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
6
;
nodes
[
6
]
=
classifiers
[
6
]
->
node
;
classifiers
[
7
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
7
;
nodes
[
7
]
=
classifiers
[
7
]
->
node
;
__m256
t
=
_mm256_set1_ps
(
variance_norm_factor
);
t
=
_mm256_mul_ps
(
t
,
_mm256_set_ps
(
nodes
[
7
]
->
threshold
,
nodes
[
6
]
->
threshold
,
nodes
[
5
]
->
threshold
,
nodes
[
4
]
->
threshold
,
nodes
[
3
]
->
threshold
,
nodes
[
2
]
->
threshold
,
nodes
[
1
]
->
threshold
,
nodes
[
0
]
->
threshold
));
__m256
offset
=
_mm256_set_ps
(
calc_sum
(
nodes
[
7
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
6
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
5
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
4
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
3
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
2
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
1
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
0
]
->
feature
.
rect
[
0
],
p_offset
));
__m256
weight
=
_mm256_set_ps
(
nodes
[
7
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
6
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
5
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
4
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
3
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
2
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
1
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
0
]
->
feature
.
rect
[
0
].
weight
);
__m256
sum
=
_mm256_mul_ps
(
offset
,
weight
);
offset
=
_mm256_set_ps
(
calc_sum
(
nodes
[
7
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
6
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
5
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
4
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
3
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
2
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
1
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
0
]
->
feature
.
rect
[
1
],
p_offset
));
weight
=
_mm256_set_ps
(
nodes
[
7
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
6
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
5
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
4
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
3
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
2
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
1
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
0
]
->
feature
.
rect
[
1
].
weight
);
sum
=
_mm256_add_ps
(
sum
,
_mm256_mul_ps
(
offset
,
weight
));
__m256
alpha0
=
_mm256_set_ps
(
classifiers
[
7
]
->
alpha
[
0
],
classifiers
[
6
]
->
alpha
[
0
],
classifiers
[
5
]
->
alpha
[
0
],
classifiers
[
4
]
->
alpha
[
0
],
classifiers
[
3
]
->
alpha
[
0
],
classifiers
[
2
]
->
alpha
[
0
],
classifiers
[
1
]
->
alpha
[
0
],
classifiers
[
0
]
->
alpha
[
0
]);
__m256
alpha1
=
_mm256_set_ps
(
classifiers
[
7
]
->
alpha
[
1
],
classifiers
[
6
]
->
alpha
[
1
],
classifiers
[
5
]
->
alpha
[
1
],
classifiers
[
4
]
->
alpha
[
1
],
classifiers
[
3
]
->
alpha
[
1
],
classifiers
[
2
]
->
alpha
[
1
],
classifiers
[
1
]
->
alpha
[
1
],
classifiers
[
0
]
->
alpha
[
1
]);
_mm256_store_ps
(
buf
,
_mm256_blendv_ps
(
alpha0
,
alpha1
,
_mm256_cmp_ps
(
t
,
sum
,
_CMP_LE_OQ
)));
stage_sum
+=
(
buf
[
0
]
+
buf
[
1
]
+
buf
[
2
]
+
buf
[
3
]
+
buf
[
4
]
+
buf
[
5
]
+
buf
[
6
]
+
buf
[
7
]);
}
for
(
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
CvHidHaarClassifier
*
classifier
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
CvHidHaarTreeNode
*
node
=
classifier
->
node
;
double
t
=
node
->
threshold
*
variance_norm_factor
;
double
sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
stage_sum
+=
classifier
->
alpha
[
sum
>=
t
];
}
}
else
{
for
(
;
j
<=
(
cascade
->
stage_classifier
[
i
].
count
)
-
8
;
j
+=
8
)
{
float
CV_DECL_ALIGNED
(
32
)
tmp
[
8
]
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
classifiers
[
0
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
nodes
[
0
]
=
classifiers
[
0
]
->
node
;
classifiers
[
1
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
1
;
nodes
[
1
]
=
classifiers
[
1
]
->
node
;
classifiers
[
2
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
2
;
nodes
[
2
]
=
classifiers
[
2
]
->
node
;
classifiers
[
3
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
3
;
nodes
[
3
]
=
classifiers
[
3
]
->
node
;
classifiers
[
4
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
4
;
nodes
[
4
]
=
classifiers
[
4
]
->
node
;
classifiers
[
5
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
5
;
nodes
[
5
]
=
classifiers
[
5
]
->
node
;
classifiers
[
6
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
6
;
nodes
[
6
]
=
classifiers
[
6
]
->
node
;
classifiers
[
7
]
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
+
7
;
nodes
[
7
]
=
classifiers
[
7
]
->
node
;
__m256
t
=
_mm256_set1_ps
(
variance_norm_factor
);
t
=
_mm256_mul_ps
(
t
,
_mm256_set_ps
(
nodes
[
7
]
->
threshold
,
nodes
[
6
]
->
threshold
,
nodes
[
5
]
->
threshold
,
nodes
[
4
]
->
threshold
,
nodes
[
3
]
->
threshold
,
nodes
[
2
]
->
threshold
,
nodes
[
1
]
->
threshold
,
nodes
[
0
]
->
threshold
));
__m256
offset
=
_mm256_set_ps
(
calc_sum
(
nodes
[
7
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
6
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
5
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
4
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
3
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
2
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
1
]
->
feature
.
rect
[
0
],
p_offset
),
calc_sum
(
nodes
[
0
]
->
feature
.
rect
[
0
],
p_offset
));
__m256
weight
=
_mm256_set_ps
(
nodes
[
7
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
6
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
5
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
4
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
3
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
2
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
1
]
->
feature
.
rect
[
0
].
weight
,
nodes
[
0
]
->
feature
.
rect
[
0
].
weight
);
__m256
sum
=
_mm256_mul_ps
(
offset
,
weight
);
offset
=
_mm256_set_ps
(
calc_sum
(
nodes
[
7
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
6
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
5
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
4
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
3
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
2
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
1
]
->
feature
.
rect
[
1
],
p_offset
),
calc_sum
(
nodes
[
0
]
->
feature
.
rect
[
1
],
p_offset
));
weight
=
_mm256_set_ps
(
nodes
[
7
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
6
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
5
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
4
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
3
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
2
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
1
]
->
feature
.
rect
[
1
].
weight
,
nodes
[
0
]
->
feature
.
rect
[
1
].
weight
);
sum
=
_mm256_add_ps
(
sum
,
_mm256_mul_ps
(
offset
,
weight
));
if
(
nodes
[
0
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
0
]
=
calc_sum
(
nodes
[
0
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
0
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
1
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
1
]
=
calc_sum
(
nodes
[
1
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
1
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
2
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
2
]
=
calc_sum
(
nodes
[
2
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
2
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
3
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
3
]
=
calc_sum
(
nodes
[
3
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
3
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
4
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
4
]
=
calc_sum
(
nodes
[
4
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
4
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
5
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
5
]
=
calc_sum
(
nodes
[
5
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
5
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
6
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
6
]
=
calc_sum
(
nodes
[
6
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
6
]
->
feature
.
rect
[
2
].
weight
;
if
(
nodes
[
7
]
->
feature
.
rect
[
2
].
p0
)
tmp
[
7
]
=
calc_sum
(
nodes
[
7
]
->
feature
.
rect
[
2
],
p_offset
)
*
nodes
[
7
]
->
feature
.
rect
[
2
].
weight
;
sum
=
_mm256_add_ps
(
sum
,
_mm256_load_ps
(
tmp
));
__m256
alpha0
=
_mm256_set_ps
(
classifiers
[
7
]
->
alpha
[
0
],
classifiers
[
6
]
->
alpha
[
0
],
classifiers
[
5
]
->
alpha
[
0
],
classifiers
[
4
]
->
alpha
[
0
],
classifiers
[
3
]
->
alpha
[
0
],
classifiers
[
2
]
->
alpha
[
0
],
classifiers
[
1
]
->
alpha
[
0
],
classifiers
[
0
]
->
alpha
[
0
]);
__m256
alpha1
=
_mm256_set_ps
(
classifiers
[
7
]
->
alpha
[
1
],
classifiers
[
6
]
->
alpha
[
1
],
classifiers
[
5
]
->
alpha
[
1
],
classifiers
[
4
]
->
alpha
[
1
],
classifiers
[
3
]
->
alpha
[
1
],
classifiers
[
2
]
->
alpha
[
1
],
classifiers
[
1
]
->
alpha
[
1
],
classifiers
[
0
]
->
alpha
[
1
]);
__m256
outBuf
=
_mm256_blendv_ps
(
alpha0
,
alpha1
,
_mm256_cmp_ps
(
t
,
sum
,
_CMP_LE_OQ
));
outBuf
=
_mm256_hadd_ps
(
outBuf
,
outBuf
);
outBuf
=
_mm256_hadd_ps
(
outBuf
,
outBuf
);
_mm256_store_ps
(
buf
,
outBuf
);
stage_sum
+=
(
buf
[
0
]
+
buf
[
4
]);
//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
}
for
(
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
CvHidHaarClassifier
*
classifier
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
CvHidHaarTreeNode
*
node
=
classifier
->
node
;
double
t
=
node
->
threshold
*
variance_norm_factor
;
double
sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
if
(
node
->
feature
.
rect
[
2
].
p0
)
sum
+=
calc_sum
(
node
->
feature
.
rect
[
2
],
p_offset
)
*
node
->
feature
.
rect
[
2
].
weight
;
stage_sum
+=
classifier
->
alpha
[
sum
>=
t
];
}
}
if
(
stage_sum
<
cascade
->
stage_classifier
[
i
].
threshold
)
return
-
i
;
}
}
else
#endif
#ifdef CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization
if
(
haveSSE2
)
{
for
(
i
=
start_stage
;
i
<
cascade
->
count
;
i
++
)
{
__m128d
stage_sum
=
_mm_setzero_pd
();
if
(
cascade
->
stage_classifier
[
i
].
two_rects
)
{
for
(
j
=
0
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
CvHidHaarClassifier
*
classifier
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
CvHidHaarTreeNode
*
node
=
classifier
->
node
;
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d
t
=
_mm_set_sd
(
node
->
threshold
*
variance_norm_factor
);
__m128d
a
=
_mm_set_sd
(
classifier
->
alpha
[
0
]);
__m128d
b
=
_mm_set_sd
(
classifier
->
alpha
[
1
]);
__m128d
sum
=
_mm_set_sd
(
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
+
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
);
t
=
_mm_cmpgt_sd
(
t
,
sum
);
stage_sum
=
_mm_add_sd
(
stage_sum
,
_mm_blendv_pd
(
b
,
a
,
t
));
}
}
else
{
for
(
j
=
0
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
CvHidHaarClassifier
*
classifier
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
CvHidHaarTreeNode
*
node
=
classifier
->
node
;
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d
t
=
_mm_set_sd
(
node
->
threshold
*
variance_norm_factor
);
__m128d
a
=
_mm_set_sd
(
classifier
->
alpha
[
0
]);
__m128d
b
=
_mm_set_sd
(
classifier
->
alpha
[
1
]);
double
_sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
_sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
if
(
node
->
feature
.
rect
[
2
].
p0
)
_sum
+=
calc_sum
(
node
->
feature
.
rect
[
2
],
p_offset
)
*
node
->
feature
.
rect
[
2
].
weight
;
__m128d
sum
=
_mm_set_sd
(
_sum
);
t
=
_mm_cmpgt_sd
(
t
,
sum
);
stage_sum
=
_mm_add_sd
(
stage_sum
,
_mm_blendv_pd
(
b
,
a
,
t
));
}
}
__m128d
i_threshold
=
_mm_set1_pd
(
cascade
->
stage_classifier
[
i
].
threshold
);
if
(
_mm_comilt_sd
(
stage_sum
,
i_threshold
)
)
return
-
i
;
}
}
else
#endif
{
for
(
i
=
start_stage
;
i
<
cascade
->
count
;
i
++
)
{
stage_sum
=
0.0
;
if
(
cascade
->
stage_classifier
[
i
].
two_rects
)
{
for
(
j
=
0
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
CvHidHaarClassifier
*
classifier
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
CvHidHaarTreeNode
*
node
=
classifier
->
node
;
double
t
=
node
->
threshold
*
variance_norm_factor
;
double
sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
stage_sum
+=
classifier
->
alpha
[
sum
>=
t
];
}
}
else
{
for
(
j
=
0
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
CvHidHaarClassifier
*
classifier
=
cascade
->
stage_classifier
[
i
].
classifier
+
j
;
CvHidHaarTreeNode
*
node
=
classifier
->
node
;
double
t
=
node
->
threshold
*
variance_norm_factor
;
double
sum
=
calc_sum
(
node
->
feature
.
rect
[
0
],
p_offset
)
*
node
->
feature
.
rect
[
0
].
weight
;
sum
+=
calc_sum
(
node
->
feature
.
rect
[
1
],
p_offset
)
*
node
->
feature
.
rect
[
1
].
weight
;
if
(
node
->
feature
.
rect
[
2
].
p0
)
sum
+=
calc_sum
(
node
->
feature
.
rect
[
2
],
p_offset
)
*
node
->
feature
.
rect
[
2
].
weight
;
stage_sum
+=
classifier
->
alpha
[
sum
>=
t
];
}
}
if
(
stage_sum
<
cascade
->
stage_classifier
[
i
].
threshold
)
return
-
i
;
}
}
}
else
{
for
(
i
=
start_stage
;
i
<
cascade
->
count
;
i
++
)
{
stage_sum
=
0.0
;
for
(
j
=
0
;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
stage_sum
+=
icvEvalHidHaarClassifier
(
cascade
->
stage_classifier
[
i
].
classifier
+
j
,
variance_norm_factor
,
p_offset
);
}
int
j
=
0
;
#ifdef CV_HAAR_USE_AVX
if
(
haveAVX
)
{
for
(
;
j
<
cascade
->
stage_classifier
[
i
].
count
-
8
;
j
+=
8
)
{
stage_sum
+=
icvEvalHidHaarClassifierAVX
(
cascade
->
stage_classifier
[
i
].
classifier
+
j
,
variance_norm_factor
,
p_offset
);
}
}
#endif
for
(;
j
<
cascade
->
stage_classifier
[
i
].
count
;
j
++
)
{
stage_sum
+=
icvEvalHidHaarClassifier
(
cascade
->
stage_classifier
[
i
].
classifier
+
j
,
variance_norm_factor
,
p_offset
);
}
if
(
stage_sum
<
cascade
->
stage_classifier
[
i
].
threshold
)
return
-
i
;
}
}
//_mm256_zeroupper();
return
1
;
}
CV_IMPL
int
cvRunHaarClassifierCascade
(
const
CvHaarClassifierCascade
*
_cascade
,
CvPoint
pt
,
int
start_stage
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment