Commit 089de14e authored by Andrey Kamaev

Fix copy-paste bug in AVX optimization of haar

parent f32eb05e
...@@ -45,7 +45,6 @@ ...@@ -45,7 +45,6 @@
#include <stdio.h> #include <stdio.h>
#include "opencv2/core/internal.hpp" #include "opencv2/core/internal.hpp"
#if CV_SSE2 || CV_SSE3 #if CV_SSE2 || CV_SSE3
# if !CV_SSE4_1 && !CV_SSE4_2 # if !CV_SSE4_1 && !CV_SSE4_2
# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) # define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
...@@ -53,13 +52,13 @@ ...@@ -53,13 +52,13 @@
# endif # endif
#endif #endif
# if CV_AVX #if CV_AVX
# define CV_HAAR_USE_AVX 1 # define CV_HAAR_USE_AVX 1
# else #else
# if CV_SSE2 || CV_SSE3 # if CV_SSE2 || CV_SSE3
# define CV_HAAR_USE_SSE 1 # define CV_HAAR_USE_SSE 1
# endif # endif
# endif #endif
/* these settings affect the quality of detection: change with care */ /* these settings affect the quality of detection: change with care */
#define CV_ADJUST_FEATURES 1 #define CV_ADJUST_FEATURES 1
...@@ -76,8 +75,7 @@ typedef struct CvHidHaarFeature ...@@ -76,8 +75,7 @@ typedef struct CvHidHaarFeature
float weight; float weight;
} }
rect[CV_HAAR_FEATURE_MAX]; rect[CV_HAAR_FEATURE_MAX];
} } CvHidHaarFeature;
CvHidHaarFeature;
typedef struct CvHidHaarTreeNode typedef struct CvHidHaarTreeNode
...@@ -86,8 +84,7 @@ typedef struct CvHidHaarTreeNode ...@@ -86,8 +84,7 @@ typedef struct CvHidHaarTreeNode
float threshold; float threshold;
int left; int left;
int right; int right;
} } CvHidHaarTreeNode;
CvHidHaarTreeNode;
typedef struct CvHidHaarClassifier typedef struct CvHidHaarClassifier
...@@ -96,8 +93,7 @@ typedef struct CvHidHaarClassifier ...@@ -96,8 +93,7 @@ typedef struct CvHidHaarClassifier
//CvHaarFeature* orig_feature; //CvHaarFeature* orig_feature;
CvHidHaarTreeNode* node; CvHidHaarTreeNode* node;
float* alpha; float* alpha;
} } CvHidHaarClassifier;
CvHidHaarClassifier;
typedef struct CvHidHaarStageClassifier typedef struct CvHidHaarStageClassifier
...@@ -110,11 +106,10 @@ typedef struct CvHidHaarStageClassifier ...@@ -110,11 +106,10 @@ typedef struct CvHidHaarStageClassifier
struct CvHidHaarStageClassifier* next; struct CvHidHaarStageClassifier* next;
struct CvHidHaarStageClassifier* child; struct CvHidHaarStageClassifier* child;
struct CvHidHaarStageClassifier* parent; struct CvHidHaarStageClassifier* parent;
} } CvHidHaarStageClassifier;
CvHidHaarStageClassifier;
struct CvHidHaarClassifierCascade typedef struct CvHidHaarClassifierCascade
{ {
int count; int count;
int isStumpBased; int isStumpBased;
...@@ -127,7 +122,7 @@ struct CvHidHaarClassifierCascade ...@@ -127,7 +122,7 @@ struct CvHidHaarClassifierCascade
sumtype *p0, *p1, *p2, *p3; sumtype *p0, *p1, *p2, *p3;
void** ipp_stages; void** ipp_stages;
}; } CvHidHaarClassifierCascade;
const int icv_object_win_border = 1; const int icv_object_win_border = 1;
...@@ -634,21 +629,21 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, ...@@ -634,21 +629,21 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
} }
//AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!! // AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
#ifdef CV_HAAR_USE_AVX #ifdef CV_HAAR_USE_AVX
CV_INLINE CV_INLINE
double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier, double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
double variance_norm_factor, size_t p_offset ) double variance_norm_factor, size_t p_offset )
{ {
int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0}; int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0};
char flags[8] = {0,0,0,0,0,0,0,0}; uchar flags[8] = {0,0,0,0,0,0,0,0};
CvHidHaarTreeNode* nodes[8]; CvHidHaarTreeNode* nodes[8];
double res = 0; double res = 0;
char exitConditionFlag = 0; uchar exitConditionFlag = 0;
for(;;) for(;;)
{ {
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
nodes[0] = classifier ->node + idxV[0]; nodes[0] = (classifier+0)->node + idxV[0];
nodes[1] = (classifier+1)->node + idxV[1]; nodes[1] = (classifier+1)->node + idxV[1];
nodes[2] = (classifier+2)->node + idxV[2]; nodes[2] = (classifier+2)->node + idxV[2];
nodes[3] = (classifier+3)->node + idxV[3]; nodes[3] = (classifier+3)->node + idxV[3];
...@@ -658,46 +653,79 @@ double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier, ...@@ -658,46 +653,79 @@ double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
nodes[7] = (classifier+7)->node + idxV[7]; nodes[7] = (classifier+7)->node + idxV[7];
__m256 t = _mm256_set1_ps(variance_norm_factor); __m256 t = _mm256_set1_ps(variance_norm_factor);
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], nodes[6]->threshold,
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); nodes[5]->threshold,
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, nodes[4]->threshold,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); nodes[3]->threshold,
__m256 sum = _mm256_mul_ps(offset, weight); nodes[2]->threshold,
nodes[1]->threshold,
nodes[0]->threshold));
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0], p_offset),
calc_sum(nodes[6]->feature.rect[0], p_offset),
calc_sum(nodes[5]->feature.rect[0], p_offset),
calc_sum(nodes[4]->feature.rect[0], p_offset),
calc_sum(nodes[3]->feature.rect[0], p_offset),
calc_sum(nodes[2]->feature.rect[0], p_offset),
calc_sum(nodes[1]->feature.rect[0], p_offset),
calc_sum(nodes[0]->feature.rect[0], p_offset));
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
nodes[6]->feature.rect[0].weight,
nodes[5]->feature.rect[0].weight,
nodes[4]->feature.rect[0].weight,
nodes[3]->feature.rect[0].weight,
nodes[2]->feature.rect[0].weight,
nodes[1]->feature.rect[0].weight,
nodes[0]->feature.rect[0].weight);
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), __m256 sum = _mm256_mul_ps(offset, weight);
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
calc_sum(nodes[0]->feature.rect[1],p_offset));
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1], p_offset),
calc_sum(nodes[6]->feature.rect[1], p_offset),
calc_sum(nodes[5]->feature.rect[1], p_offset),
calc_sum(nodes[4]->feature.rect[1], p_offset),
calc_sum(nodes[3]->feature.rect[1], p_offset),
calc_sum(nodes[2]->feature.rect[1], p_offset),
calc_sum(nodes[1]->feature.rect[1], p_offset),
calc_sum(nodes[0]->feature.rect[1], p_offset));
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
nodes[6]->feature.rect[1].weight,
nodes[5]->feature.rect[1].weight,
nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight,
nodes[2]->feature.rect[1].weight,
nodes[1]->feature.rect[1].weight,
nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
if( nodes[0]->feature.rect[2].p0 ) if( nodes[0]->feature.rect[2].p0 )
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; tmp[0] = calc_sum(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
if( nodes[1]->feature.rect[2].p0 ) if( nodes[1]->feature.rect[2].p0 )
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; tmp[1] = calc_sum(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
if( nodes[2]->feature.rect[2].p0 ) if( nodes[2]->feature.rect[2].p0 )
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; tmp[2] = calc_sum(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
if( nodes[3]->feature.rect[2].p0 ) if( nodes[3]->feature.rect[2].p0 )
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; tmp[3] = calc_sum(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
if( nodes[4]->feature.rect[2].p0 ) if( nodes[4]->feature.rect[2].p0 )
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; tmp[4] = calc_sum(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
if( nodes[5]->feature.rect[2].p0 ) if( nodes[5]->feature.rect[2].p0 )
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; tmp[5] = calc_sum(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
if( nodes[6]->feature.rect[2].p0 ) if( nodes[6]->feature.rect[2].p0 )
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; tmp[6] = calc_sum(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
if( nodes[7]->feature.rect[2].p0 ) if( nodes[7]->feature.rect[2].p0 )
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; tmp[7] = calc_sum(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
sum = _mm256_add_ps(sum,_mm256_load_ps(tmp)); sum = _mm256_add_ps(sum,_mm256_load_ps(tmp));
__m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left); __m256 left = _mm256_set_ps(nodes[7]->left, nodes[6]->left, nodes[5]->left, nodes[4]->left, nodes[3]->left, nodes[2]->left, nodes[1]->left, nodes[0]->left );
__m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right); __m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right);
_mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ )))); _mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));
for(int i = 0; i < 8; i++) for(int i = 0; i < 8; i++)
{ {
...@@ -706,17 +734,17 @@ double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier, ...@@ -706,17 +734,17 @@ double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
if(!flags[i]) if(!flags[i])
{ {
exitConditionFlag++; exitConditionFlag++;
flags[i]=1; flags[i] = 1;
res+=((classifier+i)->alpha[-idxV[i]]); res += (classifier+i)->alpha[-idxV[i]];
} }
idxV[i]=0; idxV[i]=0;
} }
} }
if(exitConditionFlag==8) if(exitConditionFlag == 8)
return res; return res;
} }
} }
#endif #endif //CV_HAAR_USE_AVX
CV_INLINE CV_INLINE
double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier, double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
...@@ -778,18 +806,16 @@ static int ...@@ -778,18 +806,16 @@ static int
cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
CvPoint pt, double& stage_sum, int start_stage ) CvPoint pt, double& stage_sum, int start_stage )
{ {
#ifdef CV_HAAR_USE_AVX #ifdef CV_HAAR_USE_AVX
bool haveAVX = false; bool haveAVX = false;
if(cv::checkHardwareSupport(CV_CPU_AVX)) if(cv::checkHardwareSupport(CV_CPU_AVX))
if(__xgetbv()&0x6)// Check if the OS will save the YMM registers if(__xgetbv()&0x6)// Check if the OS will save the YMM registers
{ haveAVX = true;
haveAVX = true; #else
} # ifdef CV_HAAR_USE_SSE
#else bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
#ifdef CV_HAAR_USE_SSE # endif
bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); #endif
#endif
#endif
int p_offset, pq_offset; int p_offset, pq_offset;
int i, j; int i, j;
...@@ -828,19 +854,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -828,19 +854,20 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
while( ptr ) while( ptr )
{ {
stage_sum = 0.0; stage_sum = 0.0;
j = 0;
#ifdef CV_HAAR_USE_AVX #ifdef CV_HAAR_USE_AVX
if(haveAVX) if(haveAVX)
{ {
for( ; j < cascade->stage_classifier[i].count-8; j+=8 ) for( ; j <= ptr->count - 8; j += 8 )
{ {
stage_sum += icvEvalHidHaarClassifierAVX( stage_sum += icvEvalHidHaarClassifierAVX(
cascade->stage_classifier[i].classifier+j, ptr->classifier + j,
variance_norm_factor, p_offset ); variance_norm_factor, p_offset );
} }
} }
#endif #endif
for( j = 0; j < ptr->count; j++ ) for( ; j < ptr->count; j++ )
{ {
stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset ); stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset );
} }
...@@ -860,283 +887,369 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -860,283 +887,369 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
} }
else if( cascade->isStumpBased ) else if( cascade->isStumpBased )
{ {
#ifdef CV_HAAR_USE_AVX #ifdef CV_HAAR_USE_AVX
if(haveAVX) if(haveAVX)
{
CvHidHaarClassifier* classifiers[8];
CvHidHaarTreeNode* nodes[8];
for( i = start_stage; i < cascade->count; i++ )
{ {
CvHidHaarClassifier* classifiers[8]; stage_sum = 0.0;
CvHidHaarTreeNode* nodes[8]; j = 0;
for( i = start_stage; i < cascade->count; i++ ) float CV_DECL_ALIGNED(32) buf[8];
if( cascade->stage_classifier[i].two_rects )
{ {
stage_sum = 0.0; for( ; j <= cascade->stage_classifier[i].count - 8; j += 8 )
j = 0;
float CV_DECL_ALIGNED(32) buf[8];
if( cascade->stage_classifier[i].two_rects )
{ {
for( ; j <= cascade->stage_classifier[i].count-8; j+=8 ) classifiers[0] = cascade->stage_classifier[i].classifier + j;
{ nodes[0] = classifiers[0]->node;
//__m256 stage_sumPart = _mm256_setzero_ps(); classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
classifiers[0] = cascade->stage_classifier[i].classifier + j; nodes[1] = classifiers[1]->node;
nodes[0] = classifiers[0]->node; classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; nodes[2] = classifiers[2]->node;
nodes[1] = classifiers[1]->node; classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; nodes[3] = classifiers[3]->node;
nodes[2]= classifiers[2]->node; classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; nodes[4] = classifiers[4]->node;
nodes[3] = classifiers[3]->node; classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; nodes[5] = classifiers[5]->node;
nodes[4] = classifiers[4]->node; classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; nodes[6] = classifiers[6]->node;
nodes[5] = classifiers[5]->node; classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; nodes[7] = classifiers[7]->node;
nodes[6] = classifiers[6]->node;
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7; __m256 t = _mm256_set1_ps(variance_norm_factor);
nodes[7] = classifiers[7]->node; t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
nodes[6]->threshold,
__m256 t = _mm256_set1_ps(variance_norm_factor); nodes[5]->threshold,
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); nodes[4]->threshold,
nodes[3]->threshold,
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), nodes[2]->threshold,
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], nodes[1]->threshold,
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); nodes[0]->threshold));
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); __m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0], p_offset),
__m256 sum = _mm256_mul_ps(offset, weight); calc_sum(nodes[6]->feature.rect[0], p_offset),
calc_sum(nodes[5]->feature.rect[0], p_offset),
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), calc_sum(nodes[4]->feature.rect[0], p_offset),
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), calc_sum(nodes[3]->feature.rect[0], p_offset),
calc_sum(nodes[0]->feature.rect[1],p_offset)); calc_sum(nodes[2]->feature.rect[0], p_offset),
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, calc_sum(nodes[1]->feature.rect[0], p_offset),
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); calc_sum(nodes[0]->feature.rect[0], p_offset));
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0], nodes[6]->feature.rect[0].weight,
classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); nodes[5]->feature.rect[0].weight,
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], nodes[4]->feature.rect[0].weight,
classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); nodes[3]->feature.rect[0].weight,
nodes[2]->feature.rect[0].weight,
_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ))); nodes[1]->feature.rect[0].weight,
stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); nodes[0]->feature.rect[0].weight);
} __m256 sum = _mm256_mul_ps(offset, weight);
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1], p_offset),
calc_sum(nodes[6]->feature.rect[1], p_offset),
calc_sum(nodes[5]->feature.rect[1], p_offset),
calc_sum(nodes[4]->feature.rect[1], p_offset),
calc_sum(nodes[3]->feature.rect[1], p_offset),
calc_sum(nodes[2]->feature.rect[1], p_offset),
calc_sum(nodes[1]->feature.rect[1], p_offset),
calc_sum(nodes[0]->feature.rect[1], p_offset));
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
nodes[6]->feature.rect[1].weight,
nodes[5]->feature.rect[1].weight,
nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight,
nodes[2]->feature.rect[1].weight,
nodes[1]->feature.rect[1].weight,
nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
classifiers[6]->alpha[0],
classifiers[5]->alpha[0],
classifiers[4]->alpha[0],
classifiers[3]->alpha[0],
classifiers[2]->alpha[0],
classifiers[1]->alpha[0],
classifiers[0]->alpha[0]);
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
classifiers[6]->alpha[1],
classifiers[5]->alpha[1],
classifiers[4]->alpha[1],
classifiers[3]->alpha[1],
classifiers[2]->alpha[1],
classifiers[1]->alpha[1],
classifiers[0]->alpha[1]);
_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
stage_sum += (buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
}
for( ; j < cascade->stage_classifier[i].count; j++ ) for( ; j < cascade->stage_classifier[i].count; j++ )
{ {
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node; CvHidHaarTreeNode* node = classifier->node;
double t = node->threshold*variance_norm_factor; double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
stage_sum += classifier->alpha[sum >= t]; stage_sum += classifier->alpha[sum >= t];
}
} }
else }
else
{
for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
{ {
for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 ) float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
{
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0}; classifiers[0] = cascade->stage_classifier[i].classifier + j;
nodes[0] = classifiers[0]->node;
classifiers[0] = cascade->stage_classifier[i].classifier + j; classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
nodes[0] = classifiers[0]->node; nodes[1] = classifiers[1]->node;
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1; classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
nodes[1] = classifiers[1]->node; nodes[2] = classifiers[2]->node;
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2; classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
nodes[2]= classifiers[2]->node; nodes[3] = classifiers[3]->node;
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3; classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
nodes[3] = classifiers[3]->node; nodes[4] = classifiers[4]->node;
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4; classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
nodes[4] = classifiers[4]->node; nodes[5] = classifiers[5]->node;
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5; classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
nodes[5] = classifiers[5]->node; nodes[6] = classifiers[6]->node;
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6; classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
nodes[6] = classifiers[6]->node; nodes[7] = classifiers[7]->node;
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
nodes[7] = classifiers[7]->node; __m256 t = _mm256_set1_ps(variance_norm_factor);
__m256 t = _mm256_set1_ps(variance_norm_factor); t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold)); nodes[6]->threshold,
nodes[5]->threshold,
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset), nodes[4]->threshold,
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0], nodes[3]->threshold,
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset)); nodes[2]->threshold,
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight, nodes[1]->threshold,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight); nodes[0]->threshold));
__m256 sum = _mm256_mul_ps(offset, weight);
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0], p_offset),
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset), calc_sum(nodes[6]->feature.rect[0], p_offset),
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset), calc_sum(nodes[5]->feature.rect[0], p_offset),
calc_sum(nodes[0]->feature.rect[1],p_offset)); calc_sum(nodes[4]->feature.rect[0], p_offset),
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight, calc_sum(nodes[3]->feature.rect[0], p_offset),
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight); calc_sum(nodes[2]->feature.rect[0], p_offset),
calc_sum(nodes[1]->feature.rect[0], p_offset),
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight)); calc_sum(nodes[0]->feature.rect[0], p_offset));
if( nodes[0]->feature.rect[2].p0 ) __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight; nodes[6]->feature.rect[0].weight,
if( nodes[1]->feature.rect[2].p0 ) nodes[5]->feature.rect[0].weight,
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight; nodes[4]->feature.rect[0].weight,
if( nodes[2]->feature.rect[2].p0 ) nodes[3]->feature.rect[0].weight,
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight; nodes[2]->feature.rect[0].weight,
if( nodes[3]->feature.rect[2].p0 ) nodes[1]->feature.rect[0].weight,
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight; nodes[0]->feature.rect[0].weight);
if( nodes[4]->feature.rect[2].p0 )
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight; __m256 sum = _mm256_mul_ps(offset, weight);
if( nodes[5]->feature.rect[2].p0 )
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight; offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1], p_offset),
if( nodes[6]->feature.rect[2].p0 ) calc_sum(nodes[6]->feature.rect[1], p_offset),
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight; calc_sum(nodes[5]->feature.rect[1], p_offset),
if( nodes[7]->feature.rect[2].p0 ) calc_sum(nodes[4]->feature.rect[1], p_offset),
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight; calc_sum(nodes[3]->feature.rect[1], p_offset),
calc_sum(nodes[2]->feature.rect[1], p_offset),
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp)); calc_sum(nodes[1]->feature.rect[1], p_offset),
calc_sum(nodes[0]->feature.rect[1], p_offset));
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]); weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1], nodes[6]->feature.rect[1].weight,
classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]); nodes[5]->feature.rect[1].weight,
nodes[4]->feature.rect[1].weight,
__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )); nodes[3]->feature.rect[1].weight,
outBuf = _mm256_hadd_ps(outBuf, outBuf); nodes[2]->feature.rect[1].weight,
outBuf = _mm256_hadd_ps(outBuf, outBuf); nodes[1]->feature.rect[1].weight,
_mm256_store_ps(buf, outBuf); nodes[0]->feature.rect[1].weight);
stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
} sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
if( nodes[0]->feature.rect[2].p0 )
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
if( nodes[1]->feature.rect[2].p0 )
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
if( nodes[2]->feature.rect[2].p0 )
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
if( nodes[3]->feature.rect[2].p0 )
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
if( nodes[4]->feature.rect[2].p0 )
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
if( nodes[5]->feature.rect[2].p0 )
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
if( nodes[6]->feature.rect[2].p0 )
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
if( nodes[7]->feature.rect[2].p0 )
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],
classifiers[6]->alpha[0],
classifiers[5]->alpha[0],
classifiers[4]->alpha[0],
classifiers[3]->alpha[0],
classifiers[2]->alpha[0],
classifiers[1]->alpha[0],
classifiers[0]->alpha[0]);
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],
classifiers[6]->alpha[1],
classifiers[5]->alpha[1],
classifiers[4]->alpha[1],
classifiers[3]->alpha[1],
classifiers[2]->alpha[1],
classifiers[1]->alpha[1],
classifiers[0]->alpha[1]);
__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
outBuf = _mm256_hadd_ps(outBuf, outBuf);
outBuf = _mm256_hadd_ps(outBuf, outBuf);
_mm256_store_ps(buf, outBuf);
stage_sum += (buf[0] + buf[4]);
}
for( ; j < cascade->stage_classifier[i].count; j++ ) for( ; j < cascade->stage_classifier[i].count; j++ )
{ {
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node; CvHidHaarTreeNode* node = classifier->node;
double t = node->threshold*variance_norm_factor; double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
if( node->feature.rect[2].p0 ) if( node->feature.rect[2].p0 )
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
stage_sum += classifier->alpha[sum >= t]; stage_sum += classifier->alpha[sum >= t];
}
} }
if( stage_sum < cascade->stage_classifier[i].threshold )
return -i;
} }
if( stage_sum < cascade->stage_classifier[i].threshold )
return -i;
} }
else }
#endif else
#if defined CV_HAAR_USE_SSE && CV_HAAR_USE_SSE && (!defined CV_HAAR_USE_AVX || !CV_HAAR_USE_AVX) //old SSE optimization #elif defined CV_HAAR_USE_SSE //old SSE optimization
if(haveSSE2) if(haveSSE2)
{
for( i = start_stage; i < cascade->count; i++ )
{ {
for( i = start_stage; i < cascade->count; i++ ) __m128d vstage_sum = _mm_setzero_pd();
if( cascade->stage_classifier[i].two_rects )
{ {
__m128d vstage_sum = _mm_setzero_pd(); for( j = 0; j < cascade->stage_classifier[i].count; j++ )
if( cascade->stage_classifier[i].two_rects )
{ {
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
{ CvHidHaarTreeNode* node = classifier->node;
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node; // ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
// ayasin - NHM perf optim. Avoid use of costly flaky jcc __m128d a = _mm_set_sd(classifier->alpha[0]);
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor); __m128d b = _mm_set_sd(classifier->alpha[1]);
__m128d a = _mm_set_sd(classifier->alpha[0]); __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
__m128d b = _mm_set_sd(classifier->alpha[1]); calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
__m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + t = _mm_cmpgt_sd(t, sum);
calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); vstage_sum = _mm_add_sd(vstage_sum, _mm_blendv_pd(b, a, t));
t = _mm_cmpgt_sd(t, sum);
vstage_sum = _mm_add_sd(vstage_sum, _mm_blendv_pd(b, a, t));
}
} }
else }
else
{
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
{ {
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
{ CvHidHaarTreeNode* node = classifier->node;
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; // ayasin - NHM perf optim. Avoid use of costly flaky jcc
CvHidHaarTreeNode* node = classifier->node; __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
// ayasin - NHM perf optim. Avoid use of costly flaky jcc __m128d a = _mm_set_sd(classifier->alpha[0]);
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor); __m128d b = _mm_set_sd(classifier->alpha[1]);
__m128d a = _mm_set_sd(classifier->alpha[0]); double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
__m128d b = _mm_set_sd(classifier->alpha[1]); _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; if( node->feature.rect[2].p0 )
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
if( node->feature.rect[2].p0 ) __m128d sum = _mm_set_sd(_sum);
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
__m128d sum = _mm_set_sd(_sum); t = _mm_cmpgt_sd(t, sum);
vstage_sum = _mm_add_sd(vstage_sum, _mm_blendv_pd(b, a, t));
t = _mm_cmpgt_sd(t, sum);
vstage_sum = _mm_add_sd(vstage_sum, _mm_blendv_pd(b, a, t));
}
} }
__m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold);
if( _mm_comilt_sd(vstage_sum, i_threshold) )
return -i;
} }
__m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold);
if( _mm_comilt_sd(vstage_sum, i_threshold) )
return -i;
} }
else }
#endif else
#endif // AVX or SSE
{
for( i = start_stage; i < cascade->count; i++ )
{ {
for( i = start_stage; i < cascade->count; i++ ) stage_sum = 0.0;
if( cascade->stage_classifier[i].two_rects )
{ {
stage_sum = 0.0; for( j = 0; j < cascade->stage_classifier[i].count; j++ )
if( cascade->stage_classifier[i].two_rects )
{ {
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
{ CvHidHaarTreeNode* node = classifier->node;
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; double t = node->threshold*variance_norm_factor;
CvHidHaarTreeNode* node = classifier->node; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
double t = node->threshold*variance_norm_factor; sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; stage_sum += classifier->alpha[sum >= t];
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
stage_sum += classifier->alpha[sum >= t];
}
} }
else }
else
{
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
{ {
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
{ CvHidHaarTreeNode* node = classifier->node;
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; double t = node->threshold*variance_norm_factor;
CvHidHaarTreeNode* node = classifier->node; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
double t = node->threshold*variance_norm_factor; sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; if( node->feature.rect[2].p0 )
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
if( node->feature.rect[2].p0 ) stage_sum += classifier->alpha[sum >= t];
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
stage_sum += classifier->alpha[sum >= t];
}
} }
if( stage_sum < cascade->stage_classifier[i].threshold )
return -i;
} }
if( stage_sum < cascade->stage_classifier[i].threshold )
return -i;
} }
}
} }
else else
{ {
for( i = start_stage; i < cascade->count; i++ ) for( i = start_stage; i < cascade->count; i++ )
{ {
stage_sum = 0.0; stage_sum = 0.0;
int k = 0; int k = 0;
#ifdef CV_HAAR_USE_AVX
#ifdef CV_HAAR_USE_AVX
if(haveAVX) if(haveAVX)
{ {
for( ; k < cascade->stage_classifier[i].count-8; k+=8 ) for( ; k < cascade->stage_classifier[i].count - 8; k += 8 )
{ {
stage_sum += icvEvalHidHaarClassifierAVX( stage_sum += icvEvalHidHaarClassifierAVX(
cascade->stage_classifier[i].classifier+k, cascade->stage_classifier[i].classifier + k,
variance_norm_factor, p_offset ); variance_norm_factor, p_offset );
} }
} }
#endif #endif
for(; k < cascade->stage_classifier[i].count; k++ ) for(; k < cascade->stage_classifier[i].count; k++ )
{ {
stage_sum += icvEvalHidHaarClassifier( stage_sum += icvEvalHidHaarClassifier(
cascade->stage_classifier[i].classifier + k, cascade->stage_classifier[i].classifier + k,
variance_norm_factor, p_offset ); variance_norm_factor, p_offset );
} }
if( stage_sum < cascade->stage_classifier[i].threshold ) if( stage_sum < cascade->stage_classifier[i].threshold )
return -i; return -i;
} }
} }
//_mm256_zeroupper();
return 1; return 1;
} }
...@@ -1186,7 +1299,7 @@ struct HaarDetectObjects_ScaleImage_Invoker ...@@ -1186,7 +1299,7 @@ struct HaarDetectObjects_ScaleImage_Invoker
Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1); Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1);
int x, y, ystep = factor > 2 ? 1 : 2; int x, y, ystep = factor > 2 ? 1 : 2;
#ifdef HAVE_IPP #ifdef HAVE_IPP
if( cascade->hid_cascade->ipp_stages ) if( cascade->hid_cascade->ipp_stages )
{ {
IppiRect iequRect = {equRect.x, equRect.y, equRect.width, equRect.height}; IppiRect iequRect = {equRect.x, equRect.y, equRect.width, equRect.height};
...@@ -1241,7 +1354,7 @@ struct HaarDetectObjects_ScaleImage_Invoker ...@@ -1241,7 +1354,7 @@ struct HaarDetectObjects_ScaleImage_Invoker
} }
} }
else else
#endif #endif // IPP
for( y = y1; y < y2; y += ystep ) for( y = y1; y < y2; y += ystep )
for( x = 0; x < ssz.width; x += ystep ) for( x = 0; x < ssz.width; x += ystep )
{ {
...@@ -1880,18 +1993,18 @@ cvReleaseHaarClassifierCascade( CvHaarClassifierCascade** _cascade ) ...@@ -1880,18 +1993,18 @@ cvReleaseHaarClassifierCascade( CvHaarClassifierCascade** _cascade )
#define ICV_HAAR_SIZE_NAME "size" #define ICV_HAAR_SIZE_NAME "size"
#define ICV_HAAR_STAGES_NAME "stages" #define ICV_HAAR_STAGES_NAME "stages"
#define ICV_HAAR_TREES_NAME "trees" #define ICV_HAAR_TREES_NAME "trees"
#define ICV_HAAR_FEATURE_NAME "feature" #define ICV_HAAR_FEATURE_NAME "feature"
#define ICV_HAAR_RECTS_NAME "rects" #define ICV_HAAR_RECTS_NAME "rects"
#define ICV_HAAR_TILTED_NAME "tilted" #define ICV_HAAR_TILTED_NAME "tilted"
#define ICV_HAAR_THRESHOLD_NAME "threshold" #define ICV_HAAR_THRESHOLD_NAME "threshold"
#define ICV_HAAR_LEFT_NODE_NAME "left_node" #define ICV_HAAR_LEFT_NODE_NAME "left_node"
#define ICV_HAAR_LEFT_VAL_NAME "left_val" #define ICV_HAAR_LEFT_VAL_NAME "left_val"
#define ICV_HAAR_RIGHT_NODE_NAME "right_node" #define ICV_HAAR_RIGHT_NODE_NAME "right_node"
#define ICV_HAAR_RIGHT_VAL_NAME "right_val" #define ICV_HAAR_RIGHT_VAL_NAME "right_val"
#define ICV_HAAR_STAGE_THRESHOLD_NAME "stage_threshold" #define ICV_HAAR_STAGE_THRESHOLD_NAME "stage_threshold"
#define ICV_HAAR_PARENT_NAME "parent" #define ICV_HAAR_PARENT_NAME "parent"
#define ICV_HAAR_NEXT_NAME "next" #define ICV_HAAR_NEXT_NAME "next"
static int static int
icvIsHaarClassifier( const void* struct_ptr ) icvIsHaarClassifier( const void* struct_ptr )
...@@ -2418,45 +2531,4 @@ CvType haar_type( CV_TYPE_NAME_HAAR, icvIsHaarClassifier, ...@@ -2418,45 +2531,4 @@ CvType haar_type( CV_TYPE_NAME_HAAR, icvIsHaarClassifier,
icvReadHaarClassifier, icvWriteHaarClassifier, icvReadHaarClassifier, icvWriteHaarClassifier,
icvCloneHaarClassifier ); icvCloneHaarClassifier );
#if 0
// Compiled out by the surrounding `#if 0`: C++ convenience wrappers that
// forward to the C-style Haar cascade functions.
namespace cv
{

// Default constructor: performs no work.
HaarClassifierCascade::HaarClassifierCascade()
{
}

// Constructs the object and immediately calls load() on `filename`.
HaarClassifierCascade::HaarClassifierCascade(const String& filename)
{
    load(filename);
}

// Reads a cascade through cvLoad(), stores it in `cascade`, and reports
// whether the stored pointer is non-null.
bool HaarClassifierCascade::load(const String& filename)
{
    CvHaarClassifierCascade* loaded =
        (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0);
    cascade = Ptr<CvHaarClassifierCascade>(loaded);

    CvHaarClassifierCascade* held = (CvHaarClassifierCascade*)cascade;
    return held != 0;
}

// Runs cvHaarDetectObjects() on `image` using a freshly created memory
// storage and copies the resulting sequence into `objects`.
void HaarClassifierCascade::detectMultiScale( const Mat& image,
                                              Vector<Rect>& objects,
                                              double scaleFactor,
                                              int minNeighbors,
                                              int flags,
                                              Size minSize )
{
    MemStorage scratch(cvCreateMemStorage(0));
    CvMat imageHeader = image;

    CvSeq* found = cvHaarDetectObjects( &imageHeader, cascade, scratch,
                                        scaleFactor, minNeighbors,
                                        flags, minSize );

    Seq<Rect> rects(found);
    rects.copyTo(objects);
}

// Forwards to cvRunHaarClassifierCascade(); the third (unnamed) argument
// is ignored.
int HaarClassifierCascade::runAt(Point pt, int startStage, int) const
{
    const int verdict = cvRunHaarClassifierCascade(cascade, pt, startStage);
    return verdict;
}

// Converts the three matrices to CvMat headers and hands them, together
// with `scale`, to cvSetImagesForHaarClassifierCascade().
void HaarClassifierCascade::setImages( const Mat& sum,
                                       const Mat& sqsum,
                                       const Mat& tilted,
                                       double scale )
{
    CvMat sumHeader = sum;
    CvMat sqsumHeader = sqsum;
    CvMat tiltedHeader = tilted;

    cvSetImagesForHaarClassifierCascade( cascade, &sumHeader, &sqsumHeader,
                                         &tiltedHeader, scale );
}

} // namespace cv
#endif
/* End of file. */ /* End of file. */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment