Commit 6bce6ee3 authored by Ilya Lavrenov

checks

parent 1d3c8604
@@ -64,7 +64,7 @@ FUNCTOR_TEMPLATE(VLoadStore128);
 #if CV_SSE2
 FUNCTOR_TEMPLATE(VLoadStore64);
 FUNCTOR_TEMPLATE(VLoadStore128Aligned);
-#if CV_AVX
+#if CV_AVX2
 FUNCTOR_TEMPLATE(VLoadStore256);
 FUNCTOR_TEMPLATE(VLoadStore256Aligned);
 #endif
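Note on this hunk: the 256-bit load/store functor declarations were guarded with CV_AVX, but the kernels that consume them rely on 256-bit integer instructions, which are AVX2 rather than AVX, so the guard is tightened to match. A minimal sketch of why the distinction matters (the add_u8_32 helper is illustrative, not from the commit):

```cpp
#include <immintrin.h>

#if CV_AVX2
// 256-bit loads/stores (_mm256_loadu_si256) are plain AVX, but the integer
// arithmetic (_mm256_add_epi8) requires AVX2, so the whole block has to sit
// under an AVX2 guard, matching the fix above.
static void add_u8_32(const unsigned char* a, const unsigned char* b,
                      unsigned char* dst)
{
    __m256i va = _mm256_loadu_si256((const __m256i*)a);
    __m256i vb = _mm256_loadu_si256((const __m256i*)b);
    _mm256_storeu_si256((__m256i*)dst, _mm256_add_epi8(va, vb));
}
#endif
```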
@@ -2626,10 +2626,16 @@ struct Div_SIMD
 template <>
 struct Div_SIMD<uchar>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2672,10 +2678,16 @@ struct Div_SIMD<uchar>
 template <>
 struct Div_SIMD<schar>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2718,10 +2730,16 @@ struct Div_SIMD<schar>
 template <>
 struct Div_SIMD<ushort>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2763,10 +2781,16 @@ struct Div_SIMD<ushort>
 template <>
 struct Div_SIMD<short>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2806,10 +2830,16 @@ struct Div_SIMD<short>
 template <>
 struct Div_SIMD<int>
 {
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2902,10 +2932,16 @@ struct Recip_SIMD
 template <>
 struct Recip_SIMD<uchar>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
     int operator() (const uchar * src2, uchar * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2941,10 +2977,16 @@ struct Recip_SIMD<uchar>
 template <>
 struct Recip_SIMD<schar>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const schar * src2, schar * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -2980,10 +3022,16 @@ struct Recip_SIMD<schar>
 template <>
 struct Recip_SIMD<ushort>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
     int operator() (const ushort * src2, ushort * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -3018,10 +3066,16 @@ struct Recip_SIMD<ushort>
 template <>
 struct Recip_SIMD<short>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const short * src2, short * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
@@ -3054,10 +3108,16 @@ struct Recip_SIMD<short>
 template <>
 struct Recip_SIMD<int>
 {
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
     int operator() (const int * src2, int * dst, int width, double scale) const
     {
         int x = 0;
 
+        if (!haveSIMD)
+            return x;
+
         __m128d v_scale = _mm_set1_pd(scale);
         __m128i v_zero = _mm_setzero_si128();
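Note on the Div_SIMD/Recip_SIMD hunks: the pattern is the same in every specialization. The CPU is probed once in the constructor, and operator() reports how many leading elements it handled, zero when the required instruction set is missing, so the caller's generic scalar loop finishes the row. A standalone sketch of the idiom (ScaleRow and cpuHasSSE2 are illustrative stand-ins; the commit uses OpenCV's checkHardwareSupport):

```cpp
#include <emmintrin.h>

// Stand-in for cv::checkHardwareSupport (GCC/Clang builtin, CPUID-backed).
static bool cpuHasSSE2() { return __builtin_cpu_supports("sse2"); }

// The functor reports how many leading elements it processed; the caller
// finishes [x, width) with plain scalar code, so a missing instruction set
// simply means "handled 0 elements" instead of an illegal-instruction trap.
struct ScaleRow
{
    bool haveSIMD;
    ScaleRow() { haveSIMD = cpuHasSSE2(); }

    int operator()(const float* src, float* dst, int width, float scale) const
    {
        int x = 0;
        if (!haveSIMD)
            return x;                  // scalar fallback does all the work
        __m128 v_scale = _mm_set1_ps(scale);
        for (; x <= width - 4; x += 4)
            _mm_storeu_ps(dst + x, _mm_mul_ps(_mm_loadu_ps(src + x), v_scale));
        return x;                      // first index not handled by SIMD
    }
};

// Caller side, mirroring the arithmetic kernels in this commit:
//     ScaleRow vop;
//     int x = vop(src, dst, width, scale);
//     for (; x < width; x++) dst[x] = src[x] * scale;
```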
@@ -4126,7 +4186,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
 {
     int x =0;
 #if CV_SSE2
-    if( USE_SSE2 ){
+    if( USE_SSE2 )
+    {
         __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
         __m128i c128 = _mm_set1_epi8 (-128);
         for( ; x <= size.width - 16; x += 16 )
@@ -4164,7 +4225,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
 {
     int x = 0;
 #if CV_SSE2
-    if( USE_SSE2 ){
+    if( USE_SSE2 )
+    {
         __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
         for( ; x <= size.width - 16; x += 16 )
         {
@@ -4254,7 +4316,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
 {
     int x =0;
 #if CV_SSE2
-    if( USE_SSE2){//
+    if( USE_SSE2)
+    {
         __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
         for( ; x <= size.width - 16; x += 16 )
         {
@@ -4293,7 +4356,6 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
         vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
     }
 #endif
-
     for( ; x < size.width; x++ ){
@@ -4308,7 +4370,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
 {
     int x = 0;
 #if CV_SSE2
-    if( USE_SSE2 ){
+    if( USE_SSE2 )
+    {
         __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
         for( ; x <= size.width - 16; x += 16 )
         {
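Note on the cmp8u/cmp16s hunks: only brace style changes here, but the context lines show the trick these blocks rely on. SSE2 provides only greater-than and equal compares, so the complementary predicates (CMP_LE, CMP_NE) are derived by XOR-ing the result with an all-ones mask (m128), and unsigned bytes are compared by flipping the sign bit first. A sketch of that idiom in isolation (cmp_u8 is an illustrative name):

```cpp
#include <emmintrin.h>

// CMP_GT keeps the raw compare (XOR with zero); CMP_LE inverts it
// (XOR with all ones). The 0x80 sign-bit flip turns the signed
// _mm_cmpgt_epi8 into an unsigned byte compare.
static inline __m128i cmp_u8(__m128i a, __m128i b, bool greater)
{
    __m128i flip = greater ? _mm_setzero_si128() : _mm_set1_epi8(-1);
    __m128i c128 = _mm_set1_epi8(-128);
    __m128i gt = _mm_cmpgt_epi8(_mm_xor_si128(a, c128),
                                _mm_xor_si128(b, c128));
    return _mm_xor_si128(gt, flip);   // 0xFF where a>b (or a<=b), else 0
}
```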
This diff is collapsed.
@@ -597,6 +597,8 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
             k = 0;
 
 #if CV_SSE2
+            if (USE_SSE2)
+            {
                 for ( ; k <= len - 4; k += 4)
                 {
                     __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
@@ -607,6 +609,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
                     _mm_storeu_ps(buf[0] + k, v_dst0);
                     _mm_storeu_ps(buf[1] + k, v_dst1);
                 }
+            }
 #endif
             for( ; k < len; k++ )
@@ -619,12 +622,15 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
             k = 0;
 
 #if CV_SSE2
+            if (USE_SSE2)
+            {
                 for ( ; k <= len - 4; k += 4)
                 {
                     __m128 v_src = _mm_loadu_ps(buf[0] + k);
                     _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
                     _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
                 }
+            }
 #endif
             for( ; k < len; k++ )
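Note on the phase() hunks (cartToPolar below receives the identical treatment): the SSE2 blocks convert the double input to float for the table-based angle computation, and they now run only when USE_SSE2, OpenCV's cached runtime flag, is set; the scalar loop after #endif picks up whatever SIMD did not process. The conversion idiom inside the guarded block, in isolation (cvt4_f64_to_f32 is an illustrative name):

```cpp
#include <emmintrin.h>

// Two _mm_cvtpd_ps results (each filling only the low 2 float lanes) are
// fused into one 4-float vector with _mm_movelh_ps,
// i.e. result[0..3] = (float)src[0..3].
static inline __m128 cvt4_f64_to_f32(const double* src)
{
    return _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src)),
                         _mm_cvtpd_ps(_mm_loadu_pd(src + 2)));
}
```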
@@ -728,6 +734,8 @@ void cartToPolar( InputArray src1, InputArray src2,
             k = 0;
 
 #if CV_SSE2
+            if (USE_SSE2)
+            {
                 for ( ; k <= len - 4; k += 4)
                 {
                     __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
@@ -738,6 +746,7 @@ void cartToPolar( InputArray src1, InputArray src2,
                     _mm_storeu_ps(buf[0] + k, v_dst0);
                     _mm_storeu_ps(buf[1] + k, v_dst1);
                 }
+            }
 #endif
             for( ; k < len; k++ )
@@ -750,12 +759,15 @@ void cartToPolar( InputArray src1, InputArray src2,
             k = 0;
 
 #if CV_SSE2
+            if (USE_SSE2)
+            {
                 for ( ; k <= len - 4; k += 4)
                 {
                     __m128 v_src = _mm_loadu_ps(buf[0] + k);
                     _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
                     _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
                 }
+            }
 #endif
             for( ; k < len; k++ )
@@ -832,7 +844,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
     k1 = N/360.;
 
 #if CV_AVX2
-    __m128d v_i = _mm_set_pd(1, 0);
+    if (USE_AVX2)
+    {
     __m128d v_k1 = _mm_set1_pd(k1);
     __m128d v_1 = _mm_set1_pd(1);
     __m128i v_N1 = _mm_set1_epi32(N - 1);
@@ -841,8 +854,6 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
     __m128d v_sin_a2 = _mm_set1_pd(sin_a2);
     __m128d v_cos_a0 = _mm_set1_pd(cos_a0);
 
-    if (USE_AVX2)
-    {
         for ( ; i <= len - 4; i += 4)
         {
             __m128 v_angle = _mm_loadu_ps(angle + i);
@@ -859,8 +870,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
             __m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
             __m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
 
-            __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1);
-            __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1);
+            __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
+            __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
 
             __m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
                                              _mm_mul_pd(v_cos_a, v_sin_b));
@@ -868,7 +879,7 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
                                              _mm_mul_pd(v_sin_a, v_sin_b));
 
             // 2-3
-            v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
+            v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
             v_it = _mm_cvtpd_epi32(v_t);
             v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
@@ -879,8 +890,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
             v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
             v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
 
-            v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 1);
-            v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 1);
+            v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
+            v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
 
             __m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
                                              _mm_mul_pd(v_cos_a, v_sin_b));
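Note on the SinCos_32f hunks: besides hoisting the USE_AVX2 check so the vector constants are only set up when the path actually runs (and dropping the unused v_i constant), two real bugs are fixed. _mm_i32gather_pd takes a byte scale as its last argument, so indexing a table of doubles needs 8, not 1; and the upper pair of float lanes must be extracted with a right shift (_mm_srli_si128), not a left shift. The gather contract in isolation (gather2_f64 is an illustrative name; requires AVX2):

```cpp
#include <immintrin.h>

// dst[i] = table[idx[i]] for the two low 32-bit indices in idx. The scale
// argument is a compile-time byte multiplier applied to each index, so for
// a table of doubles it must be sizeof(double) == 8, the value the commit
// substitutes for the incorrect 1.
static inline __m128d gather2_f64(const double* table, __m128i idx)
{
    return _mm_i32gather_pd(table, idx, 8);
}
```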
@@ -1032,12 +1043,15 @@ void polarToCart( InputArray src1, InputArray src2,
                 vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m));
             }
 #elif CV_SSE2
+            if (USE_SSE2)
+            {
                 for( ; k <= len - 4; k += 4 )
                 {
                     __m128 v_m = _mm_loadu_ps(mag + k);
                     _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m));
                     _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m));
                 }
+            }
 #endif
             for( ; k < len; k++ )
@@ -1063,9 +1077,9 @@ void polarToCart( InputArray src1, InputArray src2,
                     x[k] = buf[0][k]*m; y[k] = buf[1][k]*m;
                 }
             else
-                for( k = 0; k < len; k++ )
                 {
-                    x[k] = buf[0][k]; y[k] = buf[1][k];
+                    std::memcpy(x, buf[0], sizeof(float) * len);
+                    std::memcpy(y, buf[1], sizeof(float) * len);
                 }
         }
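Note on the polarToCart hunk: when there is no magnitude to multiply in, the element-wise copy loop is replaced by std::memcpy, the idiomatic bulk copy for trivially copyable data, which typically dispatches to a tuned, vectorized routine in the C runtime. The equivalent before/after in isolation:

```cpp
#include <cstring>

// before: for (int k = 0; k < len; k++) dst[k] = src[k];
void copy_floats(float* dst, const float* src, int len)
{
    std::memcpy(dst, src, sizeof(float) * len);
}
```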
@@ -397,6 +397,8 @@ static int countNonZero_(const T* src, int len )
     return nz;
 }
 
+#if CV_SSE2
+
 static const uchar * initPopcountTable()
 {
     static uchar tab[256];
@@ -425,6 +427,8 @@ static const uchar * initPopcountTable()
     return tab;
 }
 
+#endif
+
 static int countNonZero8u( const uchar* src, int len )
 {
     int i=0, nz = 0;
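Note on the countNonZero hunks: initPopcountTable is only consumed by the SSE2 code paths, so wrapping it in #if CV_SSE2 keeps it out of builds that would otherwise flag it as an unused function. For reference, a sketch of the popcount-table idiom such a function supports, counting zero bytes in a 16-byte block via _mm_movemask_epi8 (names are illustrative, not OpenCV's):

```cpp
#include <emmintrin.h>
#include <array>

// tab[v] = number of set bits in the byte v, built once.
static const std::array<unsigned char, 256> tab = [] {
    std::array<unsigned char, 256> t{};
    for (int i = 0; i < 256; i++)
        for (int v = i; v != 0; v >>= 1)
            t[i] = (unsigned char)(t[i] + (v & 1));
    return t;
}();

// Compare 16 bytes to zero, take the 16-bit sign mask, and look up the
// popcount of each half: the result is the number of zero bytes.
static int countZeroBytes16(const unsigned char* p)
{
    __m128i v = _mm_loadu_si128((const __m128i*)p);
    int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_setzero_si128()));
    return tab[mask & 255] + tab[(mask >> 8) & 255];
}
```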
This diff is collapsed.
@@ -1963,9 +1963,9 @@ private:
 struct ResizeAreaFastVec_SIMD_32f
 {
     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
-        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
+        cn(_cn), step(_step)
     {
-        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
+        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
     }
 
     int operator() (const float * S, float * D, int w) const
@@ -2005,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f
     }
 
 private:
-    int scale_x, scale_y;
     int cn;
     bool fast_mode;
     int step;
@@ -2289,9 +2288,10 @@ private:
 struct ResizeAreaFastVec_SIMD_32f
 {
     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
-        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
+        cn(_cn), step(_step)
     {
-        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
+        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
+        fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
     }
 
     int operator() (const float * S, float * D, int w) const
@@ -2335,7 +2335,6 @@ struct ResizeAreaFastVec_SIMD_32f
     }
 
 private:
-    int scale_x, scale_y;
     int cn;
     bool fast_mode;
     int step;
@@ -4817,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
         size.height = 1;
     }
 
+#if CV_SSE2
+    bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+#if CV_SSE4_1
+    bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
+#endif
+
     const float scale = 1.f/INTER_TAB_SIZE;
     int x, y;
     for( y = 0; y < size.height; y++ )
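Note on the convertMaps hunks (the warpAffine/warpPerspective invokers below follow the same scheme): the compile-time #if CV_SSE2 / #if CV_SSE4_1 guards only say the compiler may emit those instructions; whether the CPU running the binary supports them is a separate, runtime question, now answered once before the pixel loops and cached in useSSE2/useSSE4_1. The shape of the pattern, with the scalar tail keeping the function correct either way (the function and its operation are illustrative; CV_SSE4_1 and checkHardwareSupport are OpenCV's own macro and probe):

```cpp
#include <smmintrin.h>   // SSE4.1 intrinsics
#include <cmath>
#include <cstddef>

static void floatToU16Rows(const float* src, unsigned short* dst,
                           int width, int height)
{
#if CV_SSE4_1
    bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);   // probe once
#endif
    for (int y = 0; y < height; y++)
    {
        const float* s = src + (size_t)y * width;
        unsigned short* d = dst + (size_t)y * width;
        int x = 0;
#if CV_SSE4_1
        if (useSSE4_1)
            for (; x <= width - 4; x += 4)
            {
                __m128i v = _mm_cvtps_epi32(_mm_loadu_ps(s + x));
                // _mm_packus_epi32 (SSE4.1): signed 32 -> unsigned 16, saturated
                _mm_storel_epi64((__m128i*)(d + x), _mm_packus_epi32(v, v));
            }
#endif
        for (; x < width; x++)
            d[x] = (unsigned short)std::lrintf(s[x]);       // scalar tail
    }
}
```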
@@ -4848,6 +4854,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst2q_s16(dst1 + (x << 1), v_dst);
             }
 #elif CV_SSE4_1
+            if (useSSE4_1)
+            {
                 for( ; x <= size.width - 16; x += 16 )
                 {
                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
@@ -4867,6 +4875,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                     _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
                     _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
                 }
+            }
 #endif
             for( ; x < size.width; x++ )
             {
@@ -4902,6 +4911,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
             }
 #elif CV_SSE4_1
+            if (useSSE4_1)
+            {
                 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
                 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
@@ -4944,6 +4955,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                     _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
                     _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
                 }
+            }
 #endif
             for( ; x < size.width; x++ )
             {
@@ -5005,6 +5017,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
             }
 #elif CV_SSE2
+            if (useSSE2)
+            {
                 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
                 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
                 __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
@@ -5025,6 +5039,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                                                    _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
                     _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
                 }
+            }
 #endif
             for( ; x < size.width; x++ )
             {
@@ -5150,6 +5165,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst2q_f32(dst1f + (x << 1) + 8, v_dst);
             }
 #elif CV_SSE2
+            if (useSSE2)
+            {
                 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
                 __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
                 __m128 v_scale = _mm_set1_ps(scale);
@@ -5167,6 +5184,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                     v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale);
                     _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add));
                 }
+            }
 #endif
             for( ; x < size.width; x++ )
             {
@@ -5204,7 +5222,10 @@ public:
         const int AB_SCALE = 1 << AB_BITS;
         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
 #if CV_SSE2
-        bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+#if CV_SSE4_1
+        bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
 #endif
 
         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
@@ -5243,6 +5264,8 @@ public:
                     vst2q_s16(xy + (x1 << 1), v_dst);
                 }
 #elif CV_SSE4_1
+                if (useSSE4_1)
+                {
                     __m128i v_X0 = _mm_set1_epi32(X0);
                     __m128i v_Y0 = _mm_set1_epi32(Y0);
                     for ( ; x1 <= bw - 16; x1 += 16)
@@ -5264,6 +5287,7 @@ public:
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
                     }
+                }
 #endif
                 for( ; x1 < bw; x1++ )
                 {
@@ -5278,7 +5302,7 @@ public:
                 short* alpha = A + y1*bw;
                 x1 = 0;
 #if CV_SSE2
-                if( useSIMD )
+                if( useSSE2 )
                 {
                     __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
                     __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
@@ -5672,6 +5696,7 @@ public:
         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
 
 #if CV_SSE4_1
+        bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
         __m128d v_M0 = _mm_set1_pd(M[0]);
         __m128d v_M3 = _mm_set1_pd(M[3]);
         __m128d v_M6 = _mm_set1_pd(M[6]);
@@ -5706,6 +5731,8 @@ public:
                     x1 = 0;
 
 #if CV_SSE4_1
+                    if (haveSSE4_1)
+                    {
                     __m128d v_X0d = _mm_set1_pd(X0);
                     __m128d v_Y0d = _mm_set1_pd(Y0);
                     __m128d v_W0 = _mm_set1_pd(W0);
@@ -5810,6 +5837,7 @@ public:
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
                     }
+                    }
 #endif
                     for( ; x1 < bw; x1++ )
@@ -5831,6 +5859,8 @@ public:
                     x1 = 0;
 
 #if CV_SSE4_1
+                    if (haveSSE4_1)
+                    {
                     __m128d v_X0d = _mm_set1_pd(X0);
                     __m128d v_Y0d = _mm_set1_pd(Y0);
                     __m128d v_W0 = _mm_set1_pd(W0);
@@ -5948,6 +5978,7 @@ public:
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
                         _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
                     }
+                    }
 #endif
                     for( ; x1 < bw; x1++ )