Commit fac3d999 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

integrated another portion of SSE optimizations from Grigory Frolov

parent 5f2ce22f
...@@ -139,7 +139,7 @@ if(CMAKE_COMPILER_IS_GNUCXX) ...@@ -139,7 +139,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(ENABLE_SSSE3) if(ENABLE_SSSE3)
add_extra_compiler_option(-mssse3) add_extra_compiler_option(-mssse3)
endif() endif()
if(HAVE_GCC43_OR_NEWER) if(HAVE_GCC43_OR_NEWER OR APPLE)
if(ENABLE_SSE41) if(ENABLE_SSE41)
add_extra_compiler_option(-msse4.1) add_extra_compiler_option(-msse4.1)
endif() endif()
......
...@@ -120,17 +120,23 @@ CV_INLINE IppiSize ippiSize(int width, int height) ...@@ -120,17 +120,23 @@ CV_INLINE IppiSize ippiSize(int width, int height)
# else # else
# define CV_SSSE3 0 # define CV_SSSE3 0
# endif # endif
# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1600) # if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <smmintrin.h> # include <smmintrin.h>
# define CV_SSE4_1 1 # define CV_SSE4_1 1
# else
# define CV_SSE4_1 0
# endif # endif
# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1600) # if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <nmmintrin.h> # include <nmmintrin.h>
# define CV_SSE4_2 1 # define CV_SSE4_2 1
# else
# define CV_SSE4_2 0
# endif # endif
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600) # if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
# include <immintrin.h> # include <immintrin.h>
# define CV_AVX 1 # define CV_AVX 1
# else
# define CV_AVX 0
# endif # endif
# else # else
# define CV_SSE 0 # define CV_SSE 0
......
...@@ -954,7 +954,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) ...@@ -954,7 +954,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
size_t esz = CV_ELEM_SIZE(type); size_t esz = CV_ELEM_SIZE(type);
int m = src.rows, n = src.cols; int m = src.rows, n = src.cols;
if( method == DECOMP_SVD ) if( method == DECOMP_SVD )
{ {
int nm = std::min(m, n); int nm = std::min(m, n);
...@@ -1010,82 +1010,84 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) ...@@ -1010,82 +1010,84 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
if( type == CV_32FC1 ) if( type == CV_32FC1 )
{ {
double d = det2(Sf); double d = det2(Sf);
#if CV_SSE4_2 if( d != 0. )
if(USE_SSE4_2)
{
__m128 zero = _mm_setzero_ps();
__m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
__m128 t1 = _mm_loadh_pi(zero,(const __m64*)((const float*)(srcdata+srcstep))); //t1 = sf(1,0) sf(1,1)
__m128 s0 = _mm_blend_ps(t0,t1,12);
d = 1./d;
result = true;
__m128 det =_mm_set1_ps((float)d);
s0 = _mm_mul_ps(s0, det);
const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
__m128 pattern = _mm_load_ps((const float*)inv);
s0 = _mm_xor_ps(s0, pattern);//==-1*s0
s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
_mm_storel_pi((__m64*)dstdata, s0);
_mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
}
#else
if( d != 0. )
{ {
double t0, t1; result = true;
result = true; d = 1./d;
d = 1./d;
t0 = Sf(0,0)*d; #if CV_SSE2
t1 = Sf(1,1)*d; if(USE_SSE2)
Df(1,1) = (float)t0; {
Df(0,0) = (float)t1; __m128 zero = _mm_setzero_ps();
t0 = -Sf(0,1)*d; __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
t1 = -Sf(1,0)*d; __m128 t1 = _mm_loadh_pi(zero, (const __m64*)(srcdata+srcstep)); //t1 = sf(1,0) sf(1,1)
Df(0,1) = (float)t0; __m128 s0 = _mm_or_ps(t0, t1);
Df(1,0) = (float)t1; __m128 det =_mm_set1_ps((float)d);
} s0 = _mm_mul_ps(s0, det);
#endif const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
__m128 pattern = _mm_load_ps((const float*)inv);
s0 = _mm_xor_ps(s0, pattern);//==-1*s0
s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
_mm_storel_pi((__m64*)dstdata, s0);
_mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
}
else
#endif
{
double t0, t1;
t0 = Sf(0,0)*d;
t1 = Sf(1,1)*d;
Df(1,1) = (float)t0;
Df(0,0) = (float)t1;
t0 = -Sf(0,1)*d;
t1 = -Sf(1,0)*d;
Df(0,1) = (float)t0;
Df(1,0) = (float)t1;
}
}
} }
else else
{ {
double d = det2(Sd); double d = det2(Sd);
#if CV_SSE2 if( d != 0. )
if(USE_SSE2)
{
__m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
__m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
__m128d sm = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(1,0)); //sm = sf(0,0) sf(1,1) - main diagonal
__m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //sm = sf(0,1) sf(1,0) - secondary diagonal
result = true;
d = 1./d;
__m128d det = _mm_load1_pd((const double*)&d);
sm = _mm_mul_pd(sm, det);
//__m128d pattern = _mm_set1_pd(-1.);
static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
__m128d pattern = _mm_load1_pd((double*)inv);
ss = _mm_mul_pd(ss, det);
ss = _mm_xor_pd(ss, pattern);//==-1*ss
//ss = _mm_mul_pd(ss,pattern);
s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
_mm_store_pd((double*)dstdata, s0);
_mm_store_pd((double*)(dstdata+dststep), s1);
}
#else
if( d != 0. )
{ {
double t0, t1; result = true;
result = true; d = 1./d;
d = 1./d; #if CV_SSE2
t0 = Sd(0,0)*d; if(USE_SSE2)
t1 = Sd(1,1)*d; {
Dd(1,1) = t0; __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
Dd(0,0) = t1; __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
t0 = -Sd(0,1)*d; __m128d sm = _mm_unpacklo_pd(s0, _mm_load_sd((const double*)(srcdata+srcstep)+1)); //sm = sf(0,0) sf(1,1) - main diagonal
t1 = -Sd(1,0)*d; __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //ss = sf(0,1) sf(1,0) - secondary diagonal
Dd(0,1) = t0; __m128d det = _mm_load1_pd((const double*)&d);
Dd(1,0) = t1; sm = _mm_mul_pd(sm, det);
}
#endif uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
__m128d pattern = _mm_load1_pd((double*)inv);
ss = _mm_mul_pd(ss, det);
ss = _mm_xor_pd(ss, pattern);//==-1*ss
s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
_mm_storeu_pd((double*)dstdata, s0);
_mm_storeu_pd((double*)(dstdata+dststep), s1);
}
else
#endif
{
double t0, t1;
t0 = Sd(0,0)*d;
t1 = Sd(1,1)*d;
Dd(1,1) = t0;
Dd(0,0) = t1;
t0 = -Sd(0,1)*d;
t1 = -Sd(1,0)*d;
Dd(0,1) = t0;
Dd(1,0) = t1;
}
}
} }
} }
else if( n == 3 ) else if( n == 3 )
...@@ -1095,18 +1097,17 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) ...@@ -1095,18 +1097,17 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
double d = det3(Sf); double d = det3(Sf);
if( d != 0. ) if( d != 0. )
{ {
float t[9];
result = true; result = true;
d = 1./d; d = 1./d;
float t[9];
t[0] = (float)(((double)Sf(1,1) * Sf(2,2) - (double)Sf(1,2) * Sf(2,1)) * d); t[0] = (float)(((double)Sf(1,1) * Sf(2,2) - (double)Sf(1,2) * Sf(2,1)) * d);
t[1] = (float)(((double)Sf(0,2) * Sf(2,1) - (double)Sf(0,1) * Sf(2,2)) * d); t[1] = (float)(((double)Sf(0,2) * Sf(2,1) - (double)Sf(0,1) * Sf(2,2)) * d);
t[2] = (float)(((double)Sf(0,1) * Sf(1,2) - (double)Sf(0,2) * Sf(1,1)) * d); t[2] = (float)(((double)Sf(0,1) * Sf(1,2) - (double)Sf(0,2) * Sf(1,1)) * d);
t[3] = (float)(((double)Sf(1,2) * Sf(2,0) - (double)Sf(1,0) * Sf(2,2)) * d); t[3] = (float)(((double)Sf(1,2) * Sf(2,0) - (double)Sf(1,0) * Sf(2,2)) * d);
t[4] = (float)(((double)Sf(0,0) * Sf(2,2) - (double)Sf(0,2) * Sf(2,0)) * d); t[4] = (float)(((double)Sf(0,0) * Sf(2,2) - (double)Sf(0,2) * Sf(2,0)) * d);
t[5] = (float)(((double)Sf(0,2) * Sf(1,0) - (double)Sf(0,0) * Sf(1,2)) * d); t[5] = (float)(((double)Sf(0,2) * Sf(1,0) - (double)Sf(0,0) * Sf(1,2)) * d);
t[6] = (float)(((double)Sf(1,0) * Sf(2,1) - (double)Sf(1,1) * Sf(2,0)) * d); t[6] = (float)(((double)Sf(1,0) * Sf(2,1) - (double)Sf(1,1) * Sf(2,0)) * d);
t[7] = (float)(((double)Sf(0,1) * Sf(2,0) - (double)Sf(0,0) * Sf(2,1)) * d); t[7] = (float)(((double)Sf(0,1) * Sf(2,0) - (double)Sf(0,0) * Sf(2,1)) * d);
t[8] = (float)(((double)Sf(0,0) * Sf(1,1) - (double)Sf(0,1) * Sf(1,0)) * d); t[8] = (float)(((double)Sf(0,0) * Sf(1,1) - (double)Sf(0,1) * Sf(1,0)) * d);
...@@ -1121,18 +1122,18 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) ...@@ -1121,18 +1122,18 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
double d = det3(Sd); double d = det3(Sd);
if( d != 0. ) if( d != 0. )
{ {
result = true;
d = 1./d;
double t[9]; double t[9];
result = true;
d = 1./d;
t[0] = (Sd(1,1) * Sd(2,2) - Sd(1,2) * Sd(2,1)) * d; t[0] = (Sd(1,1) * Sd(2,2) - Sd(1,2) * Sd(2,1)) * d;
t[1] = (Sd(0,2) * Sd(2,1) - Sd(0,1) * Sd(2,2)) * d; t[1] = (Sd(0,2) * Sd(2,1) - Sd(0,1) * Sd(2,2)) * d;
t[2] = (Sd(0,1) * Sd(1,2) - Sd(0,2) * Sd(1,1)) * d; t[2] = (Sd(0,1) * Sd(1,2) - Sd(0,2) * Sd(1,1)) * d;
t[3] = (Sd(1,2) * Sd(2,0) - Sd(1,0) * Sd(2,2)) * d; t[3] = (Sd(1,2) * Sd(2,0) - Sd(1,0) * Sd(2,2)) * d;
t[4] = (Sd(0,0) * Sd(2,2) - Sd(0,2) * Sd(2,0)) * d; t[4] = (Sd(0,0) * Sd(2,2) - Sd(0,2) * Sd(2,0)) * d;
t[5] = (Sd(0,2) * Sd(1,0) - Sd(0,0) * Sd(1,2)) * d; t[5] = (Sd(0,2) * Sd(1,0) - Sd(0,0) * Sd(1,2)) * d;
t[6] = (Sd(1,0) * Sd(2,1) - Sd(1,1) * Sd(2,0)) * d; t[6] = (Sd(1,0) * Sd(2,1) - Sd(1,1) * Sd(2,0)) * d;
t[7] = (Sd(0,1) * Sd(2,0) - Sd(0,0) * Sd(2,1)) * d; t[7] = (Sd(0,1) * Sd(2,0) - Sd(0,0) * Sd(2,1)) * d;
t[8] = (Sd(0,0) * Sd(1,1) - Sd(0,1) * Sd(1,0)) * d; t[8] = (Sd(0,0) * Sd(1,1) - Sd(0,1) * Sd(1,0)) * d;
...@@ -1171,7 +1172,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) ...@@ -1171,7 +1172,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
return result; return result;
} }
int elem_size = CV_ELEM_SIZE(type); int elem_size = CV_ELEM_SIZE(type);
AutoBuffer<uchar> buf(n*n*elem_size); AutoBuffer<uchar> buf(n*n*elem_size);
Mat src1(n, n, type, (uchar*)buf); Mat src1(n, n, type, (uchar*)buf);
src.copyTo(src1); src.copyTo(src1);
...@@ -1193,6 +1194,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) ...@@ -1193,6 +1194,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
} }
/****************************************************************************************\ /****************************************************************************************\
* Solving a linear system * * Solving a linear system *
\****************************************************************************************/ \****************************************************************************************/
...@@ -1603,7 +1605,7 @@ void SVD::backSubst( InputArray _w, InputArray _u, InputArray _vt, ...@@ -1603,7 +1605,7 @@ void SVD::backSubst( InputArray _w, InputArray _u, InputArray _vt,
Mat w = _w.getMat(), u = _u.getMat(), vt = _vt.getMat(), rhs = _rhs.getMat(); Mat w = _w.getMat(), u = _u.getMat(), vt = _vt.getMat(), rhs = _rhs.getMat();
int type = w.type(), esz = (int)w.elemSize(); int type = w.type(), esz = (int)w.elemSize();
int m = u.rows, n = vt.cols, nb = rhs.data ? rhs.cols : m, nm = std::min(m, n); int m = u.rows, n = vt.cols, nb = rhs.data ? rhs.cols : m, nm = std::min(m, n);
size_t wstep = w.rows == 1 ? esz : w.cols == 1 ? (size_t)w.step : (size_t)w.step + esz; size_t wstep = w.rows == 1 ? (size_t)esz : w.cols == 1 ? (size_t)w.step : (size_t)w.step + esz;
AutoBuffer<uchar> buffer(nb*sizeof(double) + 16); AutoBuffer<uchar> buffer(nb*sizeof(double) + 16);
CV_Assert( w.type() == u.type() && u.type() == vt.type() && u.data && vt.data && w.data ); CV_Assert( w.type() == u.type() && u.type() == vt.type() && u.data && vt.data && w.data );
CV_Assert( u.cols >= nm && vt.rows >= nm && CV_Assert( u.cols >= nm && vt.rows >= nm &&
......
...@@ -951,9 +951,6 @@ cvBoundingRect( CvArr* array, int update ) ...@@ -951,9 +951,6 @@ cvBoundingRect( CvArr* array, int update )
if( ptseq->header_size < (int)sizeof(CvContour)) if( ptseq->header_size < (int)sizeof(CvContour))
{ {
/*if( update == 1 )
CV_Error( CV_StsBadArg, "The header is too small to fit the rectangle, "
"so it could not be updated" );*/
update = 0; update = 0;
calculate = 1; calculate = 1;
} }
...@@ -1067,86 +1064,123 @@ cvBoundingRect( CvArr* array, int update ) ...@@ -1067,86 +1064,123 @@ cvBoundingRect( CvArr* array, int update )
if( xmin >= size.width ) if( xmin >= size.width )
xmin = ymin = 0; xmin = ymin = 0;
} }
else if( ptseq->total ) else if( ptseq->total )
{ {
int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2; int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2;
cvStartReadSeq( ptseq, &reader, 0 ); cvStartReadSeq( ptseq, &reader, 0 );
CvPoint pt;
if( !is_float ) CV_READ_SEQ_ELEM( pt, reader );
{ #if CV_SSE4_2
CvPoint pt; if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
/* init values */ {
CV_READ_SEQ_ELEM( pt, reader ); if( !is_float )
xmin = xmax = pt.x; {
ymin = ymax = pt.y; __m128i minval, maxval;
minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y
for( i = 1; i < ptseq->total; i++ )
{ for( i = 1; i < ptseq->total; i++)
CV_READ_SEQ_ELEM( pt, reader ); {
__m128i ptXY = _mm_loadl_epi64((const __m128i*)(reader.ptr));
if( xmin > pt.x ) CV_NEXT_SEQ_ELEM(sizeof(pt), reader);
xmin = pt.x; minval = _mm_min_epi32(ptXY, minval);
maxval = _mm_max_epi32(ptXY, maxval);
if( xmax < pt.x ) }
xmax = pt.x; xmin = _mm_cvtsi128_si32(minval);
ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
if( ymin > pt.y ) xmax = _mm_cvtsi128_si32(maxval);
ymin = pt.y; ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
}
if( ymax < pt.y ) else
ymax = pt.y; {
__m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));
for( i = 1; i < ptseq->total; i++ )
{
ptXY = _mm_loadl_pi(ptXY, (const __m64*)reader.ptr);
CV_NEXT_SEQ_ELEM(sizeof(pt), reader);
minvalf = _mm_min_ps(minvalf, ptXY);
maxvalf = _mm_max_ps(maxvalf, ptXY);
}
float xyminf[2], xymaxf[2];
_mm_storel_pi((__m64*)xyminf, minvalf);
_mm_storel_pi((__m64*)xymaxf, maxvalf);
xmin = cvFloor(xyminf[0]);
ymin = cvFloor(xyminf[1]);
xmax = cvFloor(xymaxf[0]);
ymax = cvFloor(xymaxf[1]);
} }
} }
else else
{ #endif
CvPoint pt; {
Cv32suf v; if( !is_float )
/* init values */ {
CV_READ_SEQ_ELEM( pt, reader ); xmin = xmax = pt.x;
xmin = xmax = CV_TOGGLE_FLT(pt.x); ymin = ymax = pt.y;
ymin = ymax = CV_TOGGLE_FLT(pt.y);
for( i = 1; i < ptseq->total; i++ )
for( i = 1; i < ptseq->total; i++ ) {
{ CV_READ_SEQ_ELEM( pt, reader );
CV_READ_SEQ_ELEM( pt, reader );
pt.x = CV_TOGGLE_FLT(pt.x); if( xmin > pt.x )
pt.y = CV_TOGGLE_FLT(pt.y); xmin = pt.x;
if( xmin > pt.x ) if( xmax < pt.x )
xmin = pt.x; xmax = pt.x;
if( xmax < pt.x ) if( ymin > pt.y )
xmax = pt.x; ymin = pt.y;
if( ymin > pt.y ) if( ymax < pt.y )
ymin = pt.y; ymax = pt.y;
}
if( ymax < pt.y ) }
ymax = pt.y; else
} {
Cv32suf v;
v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); // init values
v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); xmin = xmax = CV_TOGGLE_FLT(pt.x);
/* because right and bottom sides of ymin = ymax = CV_TOGGLE_FLT(pt.y);
the bounding rectangle are not inclusive
(note +1 in width and height calculation below), for( i = 1; i < ptseq->total; i++ )
cvFloor is used here instead of cvCeil */ {
v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); CV_READ_SEQ_ELEM( pt, reader );
v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); pt.x = CV_TOGGLE_FLT(pt.x);
} pt.y = CV_TOGGLE_FLT(pt.y);
}
if( xmin > pt.x )
rect.x = xmin; xmin = pt.x;
rect.y = ymin;
rect.width = xmax - xmin + 1; if( xmax < pt.x )
rect.height = ymax - ymin + 1; xmax = pt.x;
if( update ) if( ymin > pt.y )
ymin = pt.y;
if( ymax < pt.y )
ymax = pt.y;
}
v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
// because right and bottom sides of the bounding rectangle are not inclusive
// (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
}
}
rect.x = xmin;
rect.y = ymin;
rect.width = xmax - xmin + 1;
rect.height = ymax - ymin + 1;
}
if( update )
((CvContour*)ptseq)->rect = rect; ((CvContour*)ptseq)->rect = rect;
return rect; return rect;
} }
/* End of file. */ /* End of file. */
...@@ -43,19 +43,26 @@ ...@@ -43,19 +43,26 @@
#include "precomp.hpp" #include "precomp.hpp"
#include <stdio.h> #include <stdio.h>
/*
/*#if CV_SSE2 #if CV_SSE2
# if CV_SSE4 || defined __SSE4__ # if !CV_SSE4_1 && !CV_SSE4_2
# include <smmintrin.h> # define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
# else # define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
# endif # endif
#if defined CV_ICC
# define CV_HAAR_USE_SSE 1
#endif #endif
#endif*/
#if defined CV_ICC
# if defined CV_AVX
# define CV_HAAR_USE_AVX 1
# else
# if defined CV_SSE2 || defined CV_SSE4_1 || defined CV_SSE4_2
# define CV_HAAR_USE_SSE 1
# else
# define CV_HAAR_NO_SIMD 1
# endif
# endif
#endif
*/
/* these settings affect the quality of detection: change with care */ /* these settings affect the quality of detection: change with care */
#define CV_ADJUST_FEATURES 1 #define CV_ADJUST_FEATURES 1
#define CV_ADJUST_WEIGHTS 0 #define CV_ADJUST_WEIGHTS 0
...@@ -730,6 +737,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -730,6 +737,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
{ {
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node; CvHidHaarTreeNode* node = classifier->node;
#ifndef CV_HAAR_USE_SSE #ifndef CV_HAAR_USE_SSE
double t = node->threshold*variance_norm_factor; double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
...@@ -745,6 +753,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, ...@@ -745,6 +753,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
t = _mm_cmpgt_sd(t, sum); t = _mm_cmpgt_sd(t, sum);
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
#endif #endif
} }
} }
else else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment