Commit fd711219 authored by Tomoaki Teshima

use universal intrinsic in VBLAS

 - brush up v_reduce_sum of SSE version
parent a8aff6f6
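For context, the universal intrinsics this commit moves VBLAS onto come from opencv2/core/hal/intrin.hpp: types such as v_float32x4 wrap the platform register (__m128 on SSE, float32x4_t on NEON), and helpers such as v_load, v_setall_f32 and v_reduce_sum replace the raw _mm_* calls. Below is a minimal sketch of the dot-product pattern the patch adopts, written as a standalone function (dot_sketch is a hypothetical name, not part of the patch), assuming CV_SIMD128 is available:

#include <opencv2/core/hal/intrin.hpp>

// Sketch only: same vector pattern as the patched VBLAS<float>::dot,
// plus a scalar tail for the remaining n % nlanes elements.
static float dot_sketch(const float* a, const float* b, int n)
{
    int k = 0;
    float sum = 0.f;
#if CV_SIMD128
    cv::v_float32x4 s = cv::v_setzero_f32();                    // vector accumulator
    for( ; k <= n - cv::v_float32x4::nlanes; k += cv::v_float32x4::nlanes )
        s += cv::v_load(a + k) * cv::v_load(b + k);             // 4 products per step
    sum = cv::v_reduce_sum(s);                                  // horizontal add of the lanes
#endif
    for( ; k < n; k++ )
        sum += a[k]*b[k];                                       // scalar tail
    return sum;
}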
@@ -1101,6 +1101,15 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
regtype val = a.val; \
val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
return (scalartype)_mm_cvt##extract(val); \
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -1111,13 +1120,14 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
return scalar_func(s0, s1); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
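The OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM macro added above performs the horizontal sum by folding the register in half twice and then extracting lane 0. A rough sketch of what it expands to for v_int32x4, where the OPENCV_HAL_NOP casts disappear and extract is si128_si32 (assuming SSE2; this is not the literal preprocessor output):

// Approximate expansion of the macro for v_int32x4
inline int v_reduce_sum(const v_int32x4& a)
{
    __m128i val = a.val;
    val = _mm_add_epi32(val, _mm_srli_si128(val, 8));  // [x0+x2, x1+x3, x2, x3]
    val = _mm_add_epi32(val, _mm_srli_si128(val, 4));  // lane 0 now holds x0+x1+x2+x3
    return (int)_mm_cvtsi128_si32(val);                // read lane 0
}

For v_float32x4 the same folding runs on __m128, with _mm_castps_si128/_mm_castsi128_ps wrapped around the byte shifts and _mm_cvtss_f32 for the final extract.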
@@ -262,25 +262,21 @@ template<typename T> struct VBLAS
int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
};
#if CV_SSE2
#if CV_SIMD128
template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
{
if( n < 8 )
return 0;
int k = 0;
__m128 s0 = _mm_setzero_ps(), s1 = _mm_setzero_ps();
for( ; k <= n - 8; k += 8 )
v_float32x4 s0 = v_setzero_f32();
for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
{
__m128 a0 = _mm_load_ps(a + k), a1 = _mm_load_ps(a + k + 4);
__m128 b0 = _mm_load_ps(b + k), b1 = _mm_load_ps(b + k + 4);
v_float32x4 a0 = v_load(a + k);
v_float32x4 b0 = v_load(b + k);
s0 = _mm_add_ps(s0, _mm_mul_ps(a0, b0));
s1 = _mm_add_ps(s1, _mm_mul_ps(a1, b1));
s0 += a0 * b0;
}
s0 = _mm_add_ps(s0, s1);
float sbuf[4];
_mm_storeu_ps(sbuf, s0);
*result = sbuf[0] + sbuf[1] + sbuf[2] + sbuf[3];
*result = v_reduce_sum(s0);
return k;
}
@@ -290,15 +286,15 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
if( n < 4 )
return 0;
int k = 0;
__m128 c4 = _mm_set1_ps(c), s4 = _mm_set1_ps(s);
for( ; k <= n - 4; k += 4 )
v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
{
__m128 a0 = _mm_load_ps(a + k);
__m128 b0 = _mm_load_ps(b + k);
__m128 t0 = _mm_add_ps(_mm_mul_ps(a0, c4), _mm_mul_ps(b0, s4));
__m128 t1 = _mm_sub_ps(_mm_mul_ps(b0, c4), _mm_mul_ps(a0, s4));
_mm_store_ps(a + k, t0);
_mm_store_ps(b + k, t1);
v_float32x4 a0 = v_load(a + k);
v_float32x4 b0 = v_load(b + k);
v_float32x4 t0 = (a0 * c4) + (b0 * s4);
v_float32x4 t1 = (b0 * c4) - (a0 * s4);
v_store(a + k, t0);
v_store(b + k, t1);
}
return k;
}
@@ -310,45 +306,40 @@ template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c,
if( n < 4 )
return 0;
int k = 0;
__m128 c4 = _mm_set1_ps(c), s4 = _mm_set1_ps(s);
__m128 sa = _mm_setzero_ps(), sb = _mm_setzero_ps();
for( ; k <= n - 4; k += 4 )
{
__m128 a0 = _mm_load_ps(a + k);
__m128 b0 = _mm_load_ps(b + k);
__m128 t0 = _mm_add_ps(_mm_mul_ps(a0, c4), _mm_mul_ps(b0, s4));
__m128 t1 = _mm_sub_ps(_mm_mul_ps(b0, c4), _mm_mul_ps(a0, s4));
_mm_store_ps(a + k, t0);
_mm_store_ps(b + k, t1);
sa = _mm_add_ps(sa, _mm_mul_ps(t0, t0));
sb = _mm_add_ps(sb, _mm_mul_ps(t1, t1));
}
float abuf[4], bbuf[4];
_mm_storeu_ps(abuf, sa);
_mm_storeu_ps(bbuf, sb);
*anorm = abuf[0] + abuf[1] + abuf[2] + abuf[3];
*bnorm = bbuf[0] + bbuf[1] + bbuf[2] + bbuf[3];
v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
v_float32x4 sa = v_setzero_f32(), sb = v_setzero_f32();
for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
{
v_float32x4 a0 = v_load(a + k);
v_float32x4 b0 = v_load(b + k);
v_float32x4 t0 = (a0 * c4) + (b0 * s4);
v_float32x4 t1 = (b0 * c4) - (a0 * s4);
v_store(a + k, t0);
v_store(b + k, t1);
sa += t0 * t0;
sb += t1 * t1;
}
*anorm = v_reduce_sum(sa);
*bnorm = v_reduce_sum(sb);
return k;
}
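For reference, the t0/t1 expressions in givens() and givensx() above apply a standard Givens plane rotation to the row pair, lane by lane:

    t0 = c*a[k] + s*b[k]   // rotated row a
    t1 = c*b[k] - s*a[k]   // rotated row b

i.e. (t0, t1)^T = [[c, s], [-s, c]] * (a[k], b[k])^T; givensx() additionally accumulates t0*t0 and t1*t1 so the caller gets the squared norms of the two rotated rows.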
#if CV_SIMD128_64F
template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
{
if( n < 4 )
return 0;
int k = 0;
__m128d s0 = _mm_setzero_pd(), s1 = _mm_setzero_pd();
for( ; k <= n - 4; k += 4 )
v_float64x2 s0 = v_setzero_f64();
for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
{
__m128d a0 = _mm_load_pd(a + k), a1 = _mm_load_pd(a + k + 2);
__m128d b0 = _mm_load_pd(b + k), b1 = _mm_load_pd(b + k + 2);
v_float64x2 a0 = v_load(a + k);
v_float64x2 b0 = v_load(b + k);
s0 = _mm_add_pd(s0, _mm_mul_pd(a0, b0));
s1 = _mm_add_pd(s1, _mm_mul_pd(a1, b1));
s0 += a0 * b0;
}
s0 = _mm_add_pd(s0, s1);
double sbuf[2];
_mm_storeu_pd(sbuf, s0);
v_store(sbuf, s0);
*result = sbuf[0] + sbuf[1];
return k;
}
@@ -357,15 +348,15 @@ template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n
template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double c, double s) const
{
int k = 0;
__m128d c2 = _mm_set1_pd(c), s2 = _mm_set1_pd(s);
for( ; k <= n - 2; k += 2 )
v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
{
__m128d a0 = _mm_load_pd(a + k);
__m128d b0 = _mm_load_pd(b + k);
__m128d t0 = _mm_add_pd(_mm_mul_pd(a0, c2), _mm_mul_pd(b0, s2));
__m128d t1 = _mm_sub_pd(_mm_mul_pd(b0, c2), _mm_mul_pd(a0, s2));
_mm_store_pd(a + k, t0);
_mm_store_pd(b + k, t1);
v_float64x2 a0 = v_load(a + k);
v_float64x2 b0 = v_load(b + k);
v_float64x2 t0 = (a0 * c2) + (b0 * s2);
v_float64x2 t1 = (b0 * c2) - (a0 * s2);
v_store(a + k, t0);
v_store(b + k, t1);
}
return k;
}
@@ -375,27 +366,28 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
double* anorm, double* bnorm) const
{
int k = 0;
__m128d c2 = _mm_set1_pd(c), s2 = _mm_set1_pd(s);
__m128d sa = _mm_setzero_pd(), sb = _mm_setzero_pd();
for( ; k <= n - 2; k += 2 )
{
__m128d a0 = _mm_load_pd(a + k);
__m128d b0 = _mm_load_pd(b + k);
__m128d t0 = _mm_add_pd(_mm_mul_pd(a0, c2), _mm_mul_pd(b0, s2));
__m128d t1 = _mm_sub_pd(_mm_mul_pd(b0, c2), _mm_mul_pd(a0, s2));
_mm_store_pd(a + k, t0);
_mm_store_pd(b + k, t1);
sa = _mm_add_pd(sa, _mm_mul_pd(t0, t0));
sb = _mm_add_pd(sb, _mm_mul_pd(t1, t1));
v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
v_float64x2 sa = v_setzero_f64(), sb = v_setzero_f64();
for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
{
v_float64x2 a0 = v_load(a + k);
v_float64x2 b0 = v_load(b + k);
v_float64x2 t0 = (a0 * c2) + (b0 * s2);
v_float64x2 t1 = (b0 * c2) - (a0 * s2);
v_store(a + k, t0);
v_store(b + k, t1);
sa += t0 * t0;
sb += t1 * t1;
}
double abuf[2], bbuf[2];
_mm_storeu_pd(abuf, sa);
_mm_storeu_pd(bbuf, sb);
v_store(abuf, sa);
v_store(bbuf, sb);
*anorm = abuf[0] + abuf[1];
*bnorm = bbuf[0] + bbuf[1];
return k;
}
#endif
#endif //CV_SIMD128_64F
#endif //CV_SIMD128
template<typename _Tp> void
JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,