Commit 95967461 authored by Vadim Pisarevsky

restored SSE2 and added AVX optimization of the old haar face detector

parent dea52eb7
...@@ -178,7 +178,7 @@ struct HWFeatures ...@@ -178,7 +178,7 @@ struct HWFeatures
f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (cpuid_data[2] & (1<<28)) != 0; f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
} }
return f; return f;
......
...@@ -43,26 +43,23 @@ ...@@ -43,26 +43,23 @@
#include "precomp.hpp" #include "precomp.hpp"
#include <stdio.h> #include <stdio.h>
/*
#if CV_SSE2
#if CV_SSE2 || CV_SSE3
# if !CV_SSE4_1 && !CV_SSE4_2 # if !CV_SSE4_1 && !CV_SSE4_2
# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) # define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) # define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
# endif # endif
#endif #endif
#if defined CV_ICC # if CV_AVX
# if defined CV_AVX
# define CV_HAAR_USE_AVX 1 # define CV_HAAR_USE_AVX 1
# else # else
# if defined CV_SSE2 || defined CV_SSE4_1 || defined CV_SSE4_2 # if CV_SSE2 || CV_SSE3
# define CV_HAAR_USE_SSE 1 # define CV_HAAR_USE_SSE 1
# else
# define CV_HAAR_NO_SIMD 1
# endif # endif
# endif # endif
#endif
*/
/* these settings affect the quality of detection: change with care */ /* these settings affect the quality of detection: change with care */
#define CV_ADJUST_FEATURES 1 #define CV_ADJUST_FEATURES 1
#define CV_ADJUST_WEIGHTS 0 #define CV_ADJUST_WEIGHTS 0
...@@ -636,12 +633,126 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade, ...@@ -636,12 +633,126 @@ cvSetImagesForHaarClassifierCascade( CvHaarClassifierCascade* _cascade,
} }
// AVX version of icvEvalHidHaarClassifier. Processes 8 CvHidHaarClassifiers per call.
// The caller must check AVX support before invoking this function.
//
// Evaluates the 8 consecutive classifiers classifier[0] .. classifier[7] in
// lock-step, one classifier per 32-bit lane of a 256-bit register, and returns
// the sum of the 8 alpha values that were selected.
//
// classifier           - first of 8 consecutive classifiers; entries
//                        classifier+0 .. classifier+7 are all dereferenced, so
//                        the caller must guarantee that 8 of them exist.
// variance_norm_factor - multiplied with every node threshold before the
//                        comparison. It is passed to _mm256_set1_ps, so the
//                        arithmetic here is done in single precision.
// p_offset             - forwarded unchanged to calc_sum (helper defined
//                        outside this block; presumably an offset into the
//                        integral image - TODO confirm).
#ifdef CV_HAAR_USE_AVX
CV_INLINE
double icvEvalHidHaarClassifierAVX( CvHidHaarClassifier* classifier,
double variance_norm_factor, size_t p_offset )
{
// Current node index of each of the 8 classifiers; 0 is the starting node.
// 32-byte aligned because it is written with _mm256_store_si256 below.
int CV_DECL_ALIGNED(32) idxV[8] = {0,0,0,0,0,0,0,0};
// flags[i] != 0 once classifier i has contributed its alpha to res.
char flags[8] = {0,0,0,0,0,0,0,0};
CvHidHaarTreeNode* nodes[8];
double res = 0;
// Number of classifiers that have finished; the loop ends when it reaches 8.
char exitConditionFlag = 0;
for(;;)
{
// Per-lane contribution of the optional third rectangle; stays 0 for lanes
// whose rect[2].p0 is null. Aligned for _mm256_load_ps.
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
// Fetch the current node of every classifier.
nodes[0] = classifier ->node + idxV[0];
nodes[1] = (classifier+1)->node + idxV[1];
nodes[2] = (classifier+2)->node + idxV[2];
nodes[3] = (classifier+3)->node + idxV[3];
nodes[4] = (classifier+4)->node + idxV[4];
nodes[5] = (classifier+5)->node + idxV[5];
nodes[6] = (classifier+6)->node + idxV[6];
nodes[7] = (classifier+7)->node + idxV[7];
// t[i] = variance_norm_factor * nodes[i]->threshold.
// _mm256_set_ps takes its arguments from the highest lane down to the lowest,
// which is why nodes[7] is listed first everywhere below; lane i <-> nodes[i].
__m256 t = _mm256_set1_ps(variance_norm_factor);
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
// sum[i] = calc_sum(rect[0]) * rect[0].weight
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
__m256 sum = _mm256_mul_ps(offset, weight);
// sum[i] += calc_sum(rect[1]) * rect[1].weight
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
calc_sum(nodes[0]->feature.rect[1],p_offset));
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
// The third rectangle is optional: it is used only when its p0 pointer is
// non-null, so it is handled per lane in scalar code and added afterwards.
if( nodes[0]->feature.rect[2].p0 )
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
if( nodes[1]->feature.rect[2].p0 )
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
if( nodes[2]->feature.rect[2].p0 )
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
if( nodes[3]->feature.rect[2].p0 )
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
if( nodes[4]->feature.rect[2].p0 )
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
if( nodes[5]->feature.rect[2].p0 )
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
if( nodes[6]->feature.rect[2].p0 )
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
if( nodes[7]->feature.rect[2].p0 )
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
sum = _mm256_add_ps(sum,_mm256_load_ps(tmp));
// Choose the next node per lane: left when sum < t, otherwise right.
// The left/right indices are carried as floats through the blend and are
// converted back to integers with truncation (_mm256_cvttps_epi32).
__m256 left = _mm256_set_ps(nodes[7]->left,nodes[6]->left,nodes[5]->left,nodes[4]->left,nodes[3]->left,nodes[2]->left,nodes[1]->left,nodes[0]->left);
__m256 right = _mm256_set_ps(nodes[7]->right,nodes[6]->right,nodes[5]->right,nodes[4]->right,nodes[3]->right,nodes[2]->right,nodes[1]->right,nodes[0]->right);
_mm256_store_si256((__m256i*)idxV,_mm256_cvttps_epi32(_mm256_blendv_ps(right, left,_mm256_cmp_ps(sum, t, _CMP_LT_OQ ))));
for(int i = 0; i < 8; i++)
{
// An index <= 0 ends the walk for this lane; -idx selects the alpha entry.
if(idxV[i]<=0)
{
// Add each classifier's alpha exactly once, even though a finished lane
// keeps being re-evaluated until every lane is done.
if(!flags[i])
{
exitConditionFlag++;
flags[i]=1;
res+=((classifier+i)->alpha[-idxV[i]]);
}
// Park the finished lane on node 0 so the next iteration reads a valid node.
idxV[i]=0;
}
}
// All 8 classifiers have produced their alpha value.
if(exitConditionFlag==8)
return res;
}
}
#endif
CV_INLINE CV_INLINE
double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier, double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
double variance_norm_factor, double variance_norm_factor,
size_t p_offset ) size_t p_offset )
{ {
int idx = 0; int idx = 0;
/*#if CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX
if(cv::checkHardwareSupport(CV_CPU_SSE2))//based on old SSE variant. Works slow
{
double CV_DECL_ALIGNED(16) temp[2];
__m128d zero = _mm_setzero_pd();
do
{
CvHidHaarTreeNode* node = classifier->node + idx;
__m128d t = _mm_set1_pd((node->threshold)*variance_norm_factor);
__m128d left = _mm_set1_pd(node->left);
__m128d right = _mm_set1_pd(node->right);
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
if( node->feature.rect[2].p0 )
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
__m128d sum = _mm_set1_pd(_sum);
t = _mm_cmplt_sd(sum, t);
sum = _mm_blendv_pd(right, left, t);
_mm_store_pd(temp, sum);
idx = (int)temp[0];
}
while(idx > 0 );
}
else
#endif*/
{
do do
{ {
CvHidHaarTreeNode* node = classifier->node + idx; CvHidHaarTreeNode* node = classifier->node + idx;
...@@ -656,14 +767,29 @@ double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier, ...@@ -656,14 +767,29 @@ double icvEvalHidHaarClassifier( CvHidHaarClassifier* classifier,
idx = sum < t ? node->left : node->right; idx = sum < t ? node->left : node->right;
} }
while( idx > 0 ); while( idx > 0 );
}
return classifier->alpha[-idx]; return classifier->alpha[-idx];
} }
static int static int
cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
CvPoint pt, double& stage_sum, int start_stage ) CvPoint pt, double& stage_sum, int start_stage )
{ {
#ifdef CV_HAAR_USE_AVX
bool haveAVX = false;
if(cv::checkHardwareSupport(CV_CPU_AVX))
if(_xgetbv(_XCR_XFEATURE_ENABLED_MASK)&0x6)// Check if the OS will save the YMM registers
{
haveAVX = true;
}
#else
#ifdef CV_HAAR_USE_SSE
bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif
#endif
int p_offset, pq_offset; int p_offset, pq_offset;
int i, j; int i, j;
double mean, variance_norm_factor; double mean, variance_norm_factor;
...@@ -702,11 +828,21 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -702,11 +828,21 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
{ {
stage_sum = 0.0; stage_sum = 0.0;
for( j = 0; j < ptr->count; j++ ) #ifdef CV_HAAR_USE_AVX
if(haveAVX)
{ {
stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, for( ; j < cascade->stage_classifier[i].count-8; j+=8 )
{
stage_sum += icvEvalHidHaarClassifierAVX(
cascade->stage_classifier[i].classifier+j,
variance_norm_factor, p_offset ); variance_norm_factor, p_offset );
} }
}
#endif
for( j = 0; j < ptr->count; j++ )
{
stage_sum += icvEvalHidHaarClassifier( ptr->classifier + j, variance_norm_factor, p_offset );
}
if( stage_sum >= ptr->threshold ) if( stage_sum >= ptr->threshold )
{ {
...@@ -723,27 +859,180 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -723,27 +859,180 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
} }
else if( cascade->isStumpBased ) else if( cascade->isStumpBased )
{ {
#ifdef CV_HAAR_USE_AVX
if(haveAVX)
{
CvHidHaarClassifier* classifiers[8];
CvHidHaarTreeNode* nodes[8];
for( i = start_stage; i < cascade->count; i++ ) for( i = start_stage; i < cascade->count; i++ )
{ {
#ifndef CV_HAAR_USE_SSE
stage_sum = 0.0; stage_sum = 0.0;
#else int j = 0;
__m128d stage_sum = _mm_setzero_pd(); float CV_DECL_ALIGNED(32) buf[8];
#endif
if( cascade->stage_classifier[i].two_rects ) if( cascade->stage_classifier[i].two_rects )
{ {
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) for( ; j <= cascade->stage_classifier[i].count-8; j+=8 )
{
//__m256 stage_sumPart = _mm256_setzero_ps();
classifiers[0] = cascade->stage_classifier[i].classifier + j;
nodes[0] = classifiers[0]->node;
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
nodes[1] = classifiers[1]->node;
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
nodes[2]= classifiers[2]->node;
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
nodes[3] = classifiers[3]->node;
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
nodes[4] = classifiers[4]->node;
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
nodes[5] = classifiers[5]->node;
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
nodes[6] = classifiers[6]->node;
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
nodes[7] = classifiers[7]->node;
__m256 t = _mm256_set1_ps(variance_norm_factor);
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
__m256 sum = _mm256_mul_ps(offset, weight);
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
calc_sum(nodes[0]->feature.rect[1],p_offset));
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ )));
stage_sum+=(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
}
for( ; j < cascade->stage_classifier[i].count; j++ )
{ {
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node; CvHidHaarTreeNode* node = classifier->node;
#ifndef CV_HAAR_USE_SSE
double t = node->threshold*variance_norm_factor; double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
stage_sum += classifier->alpha[sum >= t]; stage_sum += classifier->alpha[sum >= t];
#else }
}
else
{
for( ; j <= (cascade->stage_classifier[i].count)-8; j+=8 )
{
float CV_DECL_ALIGNED(32) tmp[8] = {0,0,0,0,0,0,0,0};
classifiers[0] = cascade->stage_classifier[i].classifier + j;
nodes[0] = classifiers[0]->node;
classifiers[1] = cascade->stage_classifier[i].classifier + j + 1;
nodes[1] = classifiers[1]->node;
classifiers[2] = cascade->stage_classifier[i].classifier + j + 2;
nodes[2]= classifiers[2]->node;
classifiers[3] = cascade->stage_classifier[i].classifier + j + 3;
nodes[3] = classifiers[3]->node;
classifiers[4] = cascade->stage_classifier[i].classifier + j + 4;
nodes[4] = classifiers[4]->node;
classifiers[5] = cascade->stage_classifier[i].classifier + j + 5;
nodes[5] = classifiers[5]->node;
classifiers[6] = cascade->stage_classifier[i].classifier + j + 6;
nodes[6] = classifiers[6]->node;
classifiers[7] = cascade->stage_classifier[i].classifier + j + 7;
nodes[7] = classifiers[7]->node;
__m256 t = _mm256_set1_ps(variance_norm_factor);
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,nodes[6]->threshold,nodes[5]->threshold,nodes[4]->threshold,nodes[3]->threshold,nodes[2]->threshold,nodes[1]->threshold,nodes[0]->threshold));
__m256 offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[0],p_offset), calc_sum(nodes[6]->feature.rect[0],p_offset), calc_sum(nodes[5]->feature.rect[0],p_offset),
calc_sum(nodes[4]->feature.rect[0],p_offset), calc_sum(nodes[3]->feature.rect[0],p_offset), calc_sum(nodes[2]->feature.rect[0],p_offset), calc_sum(nodes[1]->feature.rect[0],
p_offset),calc_sum(nodes[0]->feature.rect[0],p_offset));
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight, nodes[6]->feature.rect[0].weight, nodes[5]->feature.rect[0].weight,
nodes[4]->feature.rect[0].weight, nodes[3]->feature.rect[0].weight, nodes[2]->feature.rect[0].weight, nodes[1]->feature.rect[0].weight, nodes[0]->feature.rect[0].weight);
__m256 sum = _mm256_mul_ps(offset, weight);
offset = _mm256_set_ps(calc_sum(nodes[7]->feature.rect[1],p_offset),calc_sum(nodes[6]->feature.rect[1],p_offset),calc_sum(nodes[5]->feature.rect[1],p_offset),
calc_sum(nodes[4]->feature.rect[1],p_offset),calc_sum(nodes[3]->feature.rect[1],p_offset),calc_sum(nodes[2]->feature.rect[1],p_offset),calc_sum(nodes[1]->feature.rect[1],p_offset),
calc_sum(nodes[0]->feature.rect[1],p_offset));
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight, nodes[6]->feature.rect[1].weight, nodes[5]->feature.rect[1].weight, nodes[4]->feature.rect[1].weight,
nodes[3]->feature.rect[1].weight, nodes[2]->feature.rect[1].weight, nodes[1]->feature.rect[1].weight, nodes[0]->feature.rect[1].weight);
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset,weight));
if( nodes[0]->feature.rect[2].p0 )
tmp[0] = calc_sum(nodes[0]->feature.rect[2],p_offset) * nodes[0]->feature.rect[2].weight;
if( nodes[1]->feature.rect[2].p0 )
tmp[1] = calc_sum(nodes[1]->feature.rect[2],p_offset) * nodes[1]->feature.rect[2].weight;
if( nodes[2]->feature.rect[2].p0 )
tmp[2] = calc_sum(nodes[2]->feature.rect[2],p_offset) * nodes[2]->feature.rect[2].weight;
if( nodes[3]->feature.rect[2].p0 )
tmp[3] = calc_sum(nodes[3]->feature.rect[2],p_offset) * nodes[3]->feature.rect[2].weight;
if( nodes[4]->feature.rect[2].p0 )
tmp[4] = calc_sum(nodes[4]->feature.rect[2],p_offset) * nodes[4]->feature.rect[2].weight;
if( nodes[5]->feature.rect[2].p0 )
tmp[5] = calc_sum(nodes[5]->feature.rect[2],p_offset) * nodes[5]->feature.rect[2].weight;
if( nodes[6]->feature.rect[2].p0 )
tmp[6] = calc_sum(nodes[6]->feature.rect[2],p_offset) * nodes[6]->feature.rect[2].weight;
if( nodes[7]->feature.rect[2].p0 )
tmp[7] = calc_sum(nodes[7]->feature.rect[2],p_offset) * nodes[7]->feature.rect[2].weight;
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
__m256 alpha0 = _mm256_set_ps(classifiers[7]->alpha[0],classifiers[6]->alpha[0],classifiers[5]->alpha[0],classifiers[4]->alpha[0],classifiers[3]->alpha[0],
classifiers[2]->alpha[0],classifiers[1]->alpha[0],classifiers[0]->alpha[0]);
__m256 alpha1 = _mm256_set_ps(classifiers[7]->alpha[1],classifiers[6]->alpha[1],classifiers[5]->alpha[1],classifiers[4]->alpha[1],classifiers[3]->alpha[1],
classifiers[2]->alpha[1],classifiers[1]->alpha[1],classifiers[0]->alpha[1]);
__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ ));
outBuf = _mm256_hadd_ps(outBuf, outBuf);
outBuf = _mm256_hadd_ps(outBuf, outBuf);
_mm256_store_ps(buf, outBuf);
stage_sum+=(buf[0]+buf[4]);//(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
}
for( ; j < cascade->stage_classifier[i].count; j++ )
{
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node;
double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
if( node->feature.rect[2].p0 )
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
stage_sum += classifier->alpha[sum >= t];
}
}
if( stage_sum < cascade->stage_classifier[i].threshold )
return -i;
}
}
else
#endif
#ifdef CV_HAAR_USE_SSE && !CV_HAAR_USE_AVX //old SSE optimization
if(haveSSE2)
{
for( i = start_stage; i < cascade->count; i++ )
{
__m128d stage_sum = _mm_setzero_pd();
if( cascade->stage_classifier[i].two_rects )
{
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
{
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node;
// ayasin - NHM perf optim. Avoid use of costly flaky jcc // ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor); __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
__m128d a = _mm_set_sd(classifier->alpha[0]); __m128d a = _mm_set_sd(classifier->alpha[0]);
...@@ -752,8 +1041,6 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -752,8 +1041,6 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
t = _mm_cmpgt_sd(t, sum); t = _mm_cmpgt_sd(t, sum);
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
#endif
} }
} }
else else
...@@ -762,15 +1049,6 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -762,15 +1049,6 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
{ {
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node; CvHidHaarTreeNode* node = classifier->node;
#ifndef CV_HAAR_USE_SSE
double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
if( node->feature.rect[2].p0 )
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
stage_sum += classifier->alpha[sum >= t];
#else
// ayasin - NHM perf optim. Avoid use of costly flaky jcc // ayasin - NHM perf optim. Avoid use of costly flaky jcc
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor); __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
__m128d a = _mm_set_sd(classifier->alpha[0]); __m128d a = _mm_set_sd(classifier->alpha[0]);
...@@ -783,27 +1061,71 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -783,27 +1061,71 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
t = _mm_cmpgt_sd(t, sum); t = _mm_cmpgt_sd(t, sum);
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
#endif
} }
} }
__m128d i_threshold = _mm_set1_pd(cascade->stage_classifier[i].threshold);
#ifndef CV_HAAR_USE_SSE
if( stage_sum < cascade->stage_classifier[i].threshold )
#else
__m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold);
if( _mm_comilt_sd(stage_sum, i_threshold) ) if( _mm_comilt_sd(stage_sum, i_threshold) )
#endif
return -i; return -i;
} }
} }
else else
#endif
{ {
for( i = start_stage; i < cascade->count; i++ ) for( i = start_stage; i < cascade->count; i++ )
{ {
stage_sum = 0.0; stage_sum = 0.0;
if( cascade->stage_classifier[i].two_rects )
{
for( j = 0; j < cascade->stage_classifier[i].count; j++ ) for( j = 0; j < cascade->stage_classifier[i].count; j++ )
{ {
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node;
double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
stage_sum += classifier->alpha[sum >= t];
}
}
else
{
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
{
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node;
double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
if( node->feature.rect[2].p0 )
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
stage_sum += classifier->alpha[sum >= t];
}
}
if( stage_sum < cascade->stage_classifier[i].threshold )
return -i;
}
}
}
else
{
for( i = start_stage; i < cascade->count; i++ )
{
stage_sum = 0.0;
int j = 0;
#ifdef CV_HAAR_USE_AVX
if(haveAVX)
{
for( ; j < cascade->stage_classifier[i].count-8; j+=8 )
{
stage_sum += icvEvalHidHaarClassifierAVX(
cascade->stage_classifier[i].classifier+j,
variance_norm_factor, p_offset );
}
}
#endif
for(; j < cascade->stage_classifier[i].count; j++ )
{
stage_sum += icvEvalHidHaarClassifier( stage_sum += icvEvalHidHaarClassifier(
cascade->stage_classifier[i].classifier + j, cascade->stage_classifier[i].classifier + j,
variance_norm_factor, p_offset ); variance_norm_factor, p_offset );
...@@ -813,9 +1135,11 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -813,9 +1135,11 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
return -i; return -i;
} }
} }
//_mm256_zeroupper();
return 1; return 1;
} }
CV_IMPL int CV_IMPL int
cvRunHaarClassifierCascade( const CvHaarClassifierCascade* _cascade, cvRunHaarClassifierCascade( const CvHaarClassifierCascade* _cascade,
CvPoint pt, int start_stage ) CvPoint pt, int start_stage )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment