Commit 9fbd1d68 authored by Vadim Pisarevsky

refactored div & pow funcs; added tests for special cases in pow() function.

fixed http://code.opencv.org/issues/3935
possibly fixed http://code.opencv.org/issues/3594
parent 74e2b8cb
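
For readers skimming the diff: the div/recip kernels below replace the per-ISA SSE2/SSE4.1/NEON code with OpenCV's universal intrinsics from opencv2/hal/intrin.hpp, guarded by CV_SIMD128. A minimal sketch of the new pattern, condensed from the Div_SIMD<uchar> specialization in this commit (the free-function wrapper div8u_simd_sketch is purely illustrative and not part of the patch):

    #include "opencv2/hal/intrin.hpp"   // universal intrinsics: v_uint16x8, v_float32x4, ...

    static int div8u_simd_sketch(const uchar* src1, const uchar* src2,
                                 uchar* dst, int width, double scale)
    {
        using namespace cv;
        int x = 0;
        v_float32x4 v_scale = v_setall_f32((float)scale);
        v_uint16x8  v_zero  = v_setzero_u16();

        for ( ; x <= width - 8; x += 8)
        {
            // widen 8 x u8 to 8 x u16, then to 2 x (4 x u32), and convert to float
            v_uint16x8 v_src1 = v_load_expand(src1 + x);
            v_uint16x8 v_src2 = v_load_expand(src2 + x);
            v_uint32x4 t0, t1, t2, t3;
            v_expand(v_src1, t0, t1);
            v_expand(v_src2, t2, t3);

            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
            v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
            v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));

            // src1 * scale / src2 in single precision, rounded back to integers
            f0 = f0 * v_scale / f2;
            f1 = f1 * v_scale / f3;
            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);

            // lanes with a zero divisor produce 0, matching the scalar tail loop
            v_uint16x8 res = v_pack_u(i0, i1);
            res = v_select(v_src2 == v_zero, v_zero, res);
            v_pack_store(dst + x, res);
        }
        return x;   // the scalar loop in div_i() finishes the remaining elements
    }

The same load-expand / convert / select / pack-store shape is repeated for the schar, ushort, short, int, float and (under CV_SIMD128_64F) double specializations of Div_SIMD and Recip_SIMD below.
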
......@@ -2781,15 +2781,23 @@ struct Div_SIMD
}
};
#if CV_SSE2
template <typename T>
struct Recip_SIMD
{
int operator() (const T *, T *, int, double) const
{
return 0;
}
};
#if CV_SSE4_1
#if CV_SIMD128
template <>
struct Div_SIMD<uchar>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
{
......@@ -2798,50 +2806,44 @@ struct Div_SIMD<uchar>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src1 + x)), v_zero);
__m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
__m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero);
__m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero);
__m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
__m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_src1i = _mm_unpackhi_epi16(v_src1, v_zero);
v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
v_src1d = _mm_cvtepi32_pd(v_src1i);
v_src2d = _mm_cvtepi32_pd(v_src2i);
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
__m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
_mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
v_uint16x8 v_src1 = v_load_expand(src1 + x);
v_uint16x8 v_src2 = v_load_expand(src2 + x);
v_uint32x4 t0, t1, t2, t3;
v_expand(v_src1, t0, t1);
v_expand(v_src2, t2, t3);
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_uint16x8 res = v_pack_u(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_pack_store(dst + x, res);
}
return x;
}
};
#endif // CV_SSE4_1
template <>
struct Div_SIMD<schar>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
{
......@@ -2850,50 +2852,44 @@ struct Div_SIMD<schar>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((const __m128i *)(src1 + x))), 8);
__m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
__m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8);
__m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16);
__m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
__m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16);
v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
v_src1d = _mm_cvtepi32_pd(v_src1i);
v_src2d = _mm_cvtepi32_pd(v_src2i);
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
__m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
_mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
v_int16x8 v_src1 = v_load_expand(src1 + x);
v_int16x8 v_src2 = v_load_expand(src2 + x);
v_int32x4 t0, t1, t2, t3;
v_expand(v_src1, t0, t1);
v_expand(v_src2, t2, t3);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
v_float32x4 f2 = v_cvt_f32(t2);
v_float32x4 f3 = v_cvt_f32(t3);
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_int16x8 res = v_pack(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_pack_store(dst + x, res);
}
return x;
}
};
#if CV_SSE4_1
template <>
struct Div_SIMD<ushort>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
{
......@@ -2902,49 +2898,43 @@ struct Div_SIMD<ushort>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
v_uint16x8 v_src1 = v_load(src1 + x);
v_uint16x8 v_src2 = v_load(src2 + x);
v_uint32x4 t0, t1, t2, t3;
v_expand(v_src1, t0, t1);
v_expand(v_src2, t2, t3);
__m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero);
__m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
__m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_src1i = _mm_unpackhi_epi16(v_src1, v_zero);
v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
v_src1d = _mm_cvtepi32_pd(v_src1i);
v_src2d = _mm_cvtepi32_pd(v_src2i);
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
__m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1)));
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_uint16x8 res = v_pack_u(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_store(dst + x, res);
}
return x;
}
};
#endif // CV_SSE4_1
template <>
struct Div_SIMD<short>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
{
......@@ -2953,36 +2943,32 @@ struct Div_SIMD<short>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
v_int16x8 v_src1 = v_load(src1 + x);
v_int16x8 v_src2 = v_load(src2 + x);
__m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16);
__m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
__m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16);
v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
v_src1d = _mm_cvtepi32_pd(v_src1i);
v_src2d = _mm_cvtepi32_pd(v_src2i);
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
__m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1)));
v_int32x4 t0, t1, t2, t3;
v_expand(v_src1, t0, t1);
v_expand(v_src2, t2, t3);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
v_float32x4 f2 = v_cvt_f32(t2);
v_float32x4 f3 = v_cvt_f32(t3);
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_int16x8 res = v_pack(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_store(dst + x, res);
}
return x;
......@@ -2993,7 +2979,7 @@ template <>
struct Div_SIMD<int>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
{
......@@ -3002,100 +2988,82 @@ struct Div_SIMD<int>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int32x4 v_zero = v_setzero_s32();
for ( ; x <= width - 4; x += 4)
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
v_int32x4 t0 = v_load(src1 + x);
v_int32x4 t1 = v_load(src1 + x + 4);
v_int32x4 t2 = v_load(src2 + x);
v_int32x4 t3 = v_load(src2 + x + 4);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
v_float32x4 f2 = v_cvt_f32(t2);
v_float32x4 f3 = v_cvt_f32(t3);
__m128d v_src1d = _mm_cvtepi32_pd(v_src1);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1, 8));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
__m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
__m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero);
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst));
res0 = v_select(t2 == v_zero, v_zero, res0);
res1 = v_select(t3 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 4, res1);
}
return x;
}
};
#endif
template<typename T> static void
div_( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, Size size, double scale )
template <>
struct Div_SIMD<float>
{
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Div_SIMD<T> vop;
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
for( ; size.height--; src1 += step1, src2 += step2, dst += step )
int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
{
int i = vop(src1, src2, dst, size.width, scale);
#if CV_ENABLE_UNROLLED
for( ; i <= size.width - 4; i += 4 )
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_float32x4 v_zero = v_setzero_f32();
for ( ; x <= width - 8; x += 8)
{
if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
{
double a = (double)src2[i] * src2[i+1];
double b = (double)src2[i+2] * src2[i+3];
double d = scale/(a * b);
b *= d;
a *= d;
T z0 = saturate_cast<T>(src2[i+1] * ((double)src1[i] * b));
T z1 = saturate_cast<T>(src2[i] * ((double)src1[i+1] * b));
T z2 = saturate_cast<T>(src2[i+3] * ((double)src1[i+2] * a));
T z3 = saturate_cast<T>(src2[i+2] * ((double)src1[i+3] * a));
dst[i] = z0; dst[i+1] = z1;
dst[i+2] = z2; dst[i+3] = z3;
}
else
{
T z0 = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
T z1 = src2[i+1] != 0 ? saturate_cast<T>(src1[i+1]*scale/src2[i+1]) : 0;
T z2 = src2[i+2] != 0 ? saturate_cast<T>(src1[i+2]*scale/src2[i+2]) : 0;
T z3 = src2[i+3] != 0 ? saturate_cast<T>(src1[i+3]*scale/src2[i+3]) : 0;
v_float32x4 f0 = v_load(src1 + x);
v_float32x4 f1 = v_load(src1 + x + 4);
v_float32x4 f2 = v_load(src2 + x);
v_float32x4 f3 = v_load(src2 + x + 4);
dst[i] = z0; dst[i+1] = z1;
dst[i+2] = z2; dst[i+3] = z3;
}
v_float32x4 res0 = f0 * v_scale / f2;
v_float32x4 res1 = f1 * v_scale / f3;
res0 = v_select(f2 == v_zero, v_zero, res0);
res1 = v_select(f3 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 4, res1);
}
#endif
for( ; i < size.width; i++ )
dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
}
}
template <typename T>
struct Recip_SIMD
{
int operator() (const T *, T *, int, double) const
{
return 0;
return x;
}
};
#if CV_SSE2
#if CV_SSE4_1
///////////////////////// RECIPROCAL //////////////////////
template <>
struct Recip_SIMD<uchar>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const uchar * src2, uchar * dst, int width, double scale) const
{
......@@ -3104,43 +3072,39 @@ struct Recip_SIMD<uchar>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
__m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
__m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero);
v_uint16x8 v_src2 = v_load_expand(src2 + x);
__m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_uint32x4 t0, t1;
v_expand(v_src2, t0, t1);
v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
v_src2d = _mm_cvtepi32_pd(v_src2i);
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
__m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
_mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
f0 = v_scale / f0;
f1 = v_scale / f1;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_uint16x8 res = v_pack_u(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_pack_store(dst + x, res);
}
return x;
}
};
#endif // CV_SSE4_1
template <>
struct Recip_SIMD<schar>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const schar * src2, schar * dst, int width, double scale) const
{
......@@ -3149,43 +3113,39 @@ struct Recip_SIMD<schar>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
__m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
__m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8);
v_int16x8 v_src2 = v_load_expand(src2 + x);
v_int32x4 t0, t1;
v_expand(v_src2, t0, t1);
__m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
v_src2d = _mm_cvtepi32_pd(v_src2i);
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
f0 = v_scale / f0;
f1 = v_scale / f1;
__m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
_mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_int16x8 res = v_pack(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_pack_store(dst + x, res);
}
return x;
}
};
#if CV_SSE4_1
template <>
struct Recip_SIMD<ushort>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const ushort * src2, ushort * dst, int width, double scale) const
{
......@@ -3194,42 +3154,38 @@ struct Recip_SIMD<ushort>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
v_uint16x8 v_src2 = v_load(src2 + x);
__m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_uint32x4 t0, t1;
v_expand(v_src2, t0, t1);
v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
v_src2d = _mm_cvtepi32_pd(v_src2i);
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
__m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1)));
f0 = v_scale / f0;
f1 = v_scale / f1;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_uint16x8 res = v_pack_u(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_store(dst + x, res);
}
return x;
}
};
#endif // CV_SSE4_1
template <>
struct Recip_SIMD<short>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const short * src2, short * dst, int width, double scale) const
{
......@@ -3238,29 +3194,27 @@ struct Recip_SIMD<short>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
v_int16x8 v_src2 = v_load(src2 + x);
v_int32x4 t0, t1;
v_expand(v_src2, t0, t1);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
__m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
__m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
__m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
f0 = v_scale / f0;
f1 = v_scale / f1;
v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
v_src2d = _mm_cvtepi32_pd(v_src2i);
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
__m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_int16x8 res = v_pack(i0, i1);
__m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1)));
res = v_select(v_src2 == v_zero, v_zero, res);
v_store(dst + x, res);
}
return x;
......@@ -3271,7 +3225,7 @@ template <>
struct Recip_SIMD<int>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const int * src2, int * dst, int width, double scale) const
{
......@@ -3280,22 +3234,136 @@ struct Recip_SIMD<int>
if (!haveSIMD)
return x;
__m128d v_scale = _mm_set1_pd(scale);
__m128i v_zero = _mm_setzero_si128();
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int32x4 v_zero = v_setzero_s32();
for ( ; x <= width - 8; x += 8)
{
v_int32x4 t0 = v_load(src2 + x);
v_int32x4 t1 = v_load(src2 + x + 4);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
f0 = v_scale / f0;
f1 = v_scale / f1;
v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
res0 = v_select(t0 == v_zero, v_zero, res0);
res1 = v_select(t1 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 4, res1);
}
return x;
}
};
template <>
struct Recip_SIMD<float>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const float * src2, float * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_float32x4 v_zero = v_setzero_f32();
for ( ; x <= width - 8; x += 8)
{
v_float32x4 f0 = v_load(src2 + x);
v_float32x4 f1 = v_load(src2 + x + 4);
v_float32x4 res0 = v_scale / f0;
v_float32x4 res1 = v_scale / f1;
res0 = v_select(f0 == v_zero, v_zero, res0);
res1 = v_select(f1 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 4, res1);
}
return x;
}
};
#if CV_SIMD128_64F
template <>
struct Div_SIMD<double>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float64x2 v_scale = v_setall_f64(scale);
v_float64x2 v_zero = v_setzero_f64();
for ( ; x <= width - 4; x += 4)
{
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
v_float64x2 f0 = v_load(src1 + x);
v_float64x2 f1 = v_load(src1 + x + 2);
v_float64x2 f2 = v_load(src2 + x);
v_float64x2 f3 = v_load(src2 + x + 2);
v_float64x2 res0 = f0 * v_scale / f2;
v_float64x2 res1 = f1 * v_scale / f3;
res0 = v_select(f0 == v_zero, v_zero, res0);
res1 = v_select(f1 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 2, res1);
}
return x;
}
};
template <>
struct Recip_SIMD<double>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const double * src2, double * dst, int width, double scale) const
{
int x = 0;
__m128d v_src2d = _mm_cvtepi32_pd(v_src2);
__m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
if (!haveSIMD)
return x;
v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8));
__m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
v_float64x2 v_scale = v_setall_f64(scale);
v_float64x2 v_zero = v_setzero_f64();
__m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
__m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero);
_mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst));
for ( ; x <= width - 4; x += 4)
{
v_float64x2 f0 = v_load(src2 + x);
v_float64x2 f1 = v_load(src2 + x + 2);
v_float64x2 res0 = v_scale / f0;
v_float64x2 res1 = v_scale / f1;
res0 = v_select(f0 == v_zero, v_zero, res0);
res1 = v_select(f1 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 2, res1);
}
return x;
......@@ -3304,51 +3372,91 @@ struct Recip_SIMD<int>
#endif
#endif
template<typename T> static void
recip_( const T*, size_t, const T* src2, size_t step2,
T* dst, size_t step, Size size, double scale )
div_i( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, Size size, double scale )
{
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Div_SIMD<T> vop;
float scale_f = (float)scale;
for( ; size.height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, size.width, scale);
for( ; i < size.width; i++ )
{
T num = src1[i], denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
}
}
}
template<typename T> static void
div_f( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, Size size, double scale )
{
T scale_f = (T)scale;
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Div_SIMD<T> vop;
for( ; size.height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, size.width, scale);
for( ; i < size.width; i++ )
{
T num = src1[i], denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
}
}
}
template<typename T> static void
recip_i( const T*, size_t, const T* src2, size_t step2,
T* dst, size_t step, Size size, double scale )
{
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Recip_SIMD<T> vop;
float scale_f = (float)scale;
for( ; size.height--; src2 += step2, dst += step )
{
int i = vop(src2, dst, size.width, scale);
#if CV_ENABLE_UNROLLED
for( ; i <= size.width - 4; i += 4 )
for( ; i < size.width; i++ )
{
if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
{
double a = (double)src2[i] * src2[i+1];
double b = (double)src2[i+2] * src2[i+3];
double d = scale/(a * b);
b *= d;
a *= d;
T z0 = saturate_cast<T>(src2[i+1] * b);
T z1 = saturate_cast<T>(src2[i] * b);
T z2 = saturate_cast<T>(src2[i+3] * a);
T z3 = saturate_cast<T>(src2[i+2] * a);
dst[i] = z0; dst[i+1] = z1;
dst[i+2] = z2; dst[i+3] = z3;
}
else
{
T z0 = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
T z1 = src2[i+1] != 0 ? saturate_cast<T>(scale/src2[i+1]) : 0;
T z2 = src2[i+2] != 0 ? saturate_cast<T>(scale/src2[i+2]) : 0;
T z3 = src2[i+3] != 0 ? saturate_cast<T>(scale/src2[i+3]) : 0;
dst[i] = z0; dst[i+1] = z1;
dst[i+2] = z2; dst[i+3] = z3;
}
T denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
}
#endif
}
}
template<typename T> static void
recip_f( const T*, size_t, const T* src2, size_t step2,
T* dst, size_t step, Size size, double scale )
{
T scale_f = (T)scale;
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Recip_SIMD<T> vop;
for( ; size.height--; src2 += step2, dst += step )
{
int i = vop(src2, dst, size.width, scale);
for( ; i < size.width; i++ )
dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
{
T denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
}
}
}
......@@ -3459,87 +3567,87 @@ static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t st
uchar* dst, size_t step, Size sz, void* scale)
{
if( src1 )
div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
else
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, Size sz, void* scale)
{
div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, Size sz, void* scale)
{
div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, Size sz, void* scale)
{
div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, Size sz, void* scale)
{
div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, Size sz, void* scale)
{
div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, Size sz, void* scale)
{
div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, Size sz, void* scale)
{
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, Size sz, void* scale)
{
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, Size sz, void* scale)
{
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, Size sz, void* scale)
{
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, Size sz, void* scale)
{
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, Size sz, void* scale)
{
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, Size sz, void* scale)
{
recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}
......
......@@ -502,7 +502,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
{
sd = i < n ? W[i] : 0;
while( sd <= minval )
for( int ii = 0; ii < 100 && sd <= minval; ii++ )
{
// if we got a zero singular value, then in order to get the corresponding left singular vector
// we generate a random vector, project it to the previously computed left singular vectors,
......@@ -541,7 +541,7 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
sd = std::sqrt(sd);
}
s = (_Tp)(1/sd);
s = (_Tp)(sd > minval ? 1/sd : 0.);
for( k = 0; k < m; k++ )
At[i*astep + k] *= s;
}
......
......@@ -889,38 +889,41 @@ struct iPow_SIMD
}
};
#if CV_NEON
#if CV_SIMD128
template <>
struct iPow_SIMD<uchar, int>
{
int operator() ( const uchar * src, uchar * dst, int len, int power)
int operator() ( const uchar * src, uchar * dst, int len, int power )
{
int i = 0;
uint32x4_t v_1 = vdupq_n_u32(1u);
v_uint32x4 v_1 = v_setall_u32(1u);
for ( ; i <= len - 8; i += 8)
{
uint32x4_t v_a1 = v_1, v_a2 = v_1;
uint16x8_t v_src = vmovl_u8(vld1_u8(src + i));
uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src));
v_uint32x4 v_a1 = v_1, v_a2 = v_1;
v_uint16x8 v = v_load_expand(src + i);
v_uint32x4 v_b1, v_b2;
v_expand(v, v_b1, v_b2);
int p = power;
while( p > 1 )
{
if (p & 1)
{
v_a1 = vmulq_u32(v_a1, v_b1);
v_a2 = vmulq_u32(v_a2, v_b2);
v_a1 *= v_b1;
v_a2 *= v_b2;
}
v_b1 = vmulq_u32(v_b1, v_b1);
v_b2 = vmulq_u32(v_b2, v_b2);
v_b1 *= v_b1;
v_b2 *= v_b2;
p >>= 1;
}
v_a1 = vmulq_u32(v_a1, v_b1);
v_a2 = vmulq_u32(v_a2, v_b2);
vst1_u8(dst + i, vqmovn_u16(vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2))));
v_a1 *= v_b1;
v_a2 *= v_b2;
v = v_pack(v_a1, v_a2);
v_pack_store(dst + i, v);
}
return i;
......@@ -933,30 +936,33 @@ struct iPow_SIMD<schar, int>
int operator() ( const schar * src, schar * dst, int len, int power)
{
int i = 0;
int32x4_t v_1 = vdupq_n_s32(1);
v_int32x4 v_1 = v_setall_s32(1);
for ( ; i <= len - 8; i += 8)
{
int32x4_t v_a1 = v_1, v_a2 = v_1;
int16x8_t v_src = vmovl_s8(vld1_s8(src + i));
int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src));
v_int32x4 v_a1 = v_1, v_a2 = v_1;
v_int16x8 v = v_load_expand(src + i);
v_int32x4 v_b1, v_b2;
v_expand(v, v_b1, v_b2);
int p = power;
while( p > 1 )
{
if (p & 1)
{
v_a1 = vmulq_s32(v_a1, v_b1);
v_a2 = vmulq_s32(v_a2, v_b2);
v_a1 *= v_b1;
v_a2 *= v_b2;
}
v_b1 = vmulq_s32(v_b1, v_b1);
v_b2 = vmulq_s32(v_b2, v_b2);
v_b1 *= v_b1;
v_b2 *= v_b2;
p >>= 1;
}
v_a1 = vmulq_s32(v_a1, v_b1);
v_a2 = vmulq_s32(v_a2, v_b2);
vst1_s8(dst + i, vqmovn_s16(vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2))));
v_a1 *= v_b1;
v_a2 *= v_b2;
v = v_pack(v_a1, v_a2);
v_pack_store(dst + i, v);
}
return i;
......@@ -969,30 +975,33 @@ struct iPow_SIMD<ushort, int>
int operator() ( const ushort * src, ushort * dst, int len, int power)
{
int i = 0;
uint32x4_t v_1 = vdupq_n_u32(1u);
v_uint32x4 v_1 = v_setall_u32(1u);
for ( ; i <= len - 8; i += 8)
{
uint32x4_t v_a1 = v_1, v_a2 = v_1;
uint16x8_t v_src = vld1q_u16(src + i);
uint32x4_t v_b1 = vmovl_u16(vget_low_u16(v_src)), v_b2 = vmovl_u16(vget_high_u16(v_src));
v_uint32x4 v_a1 = v_1, v_a2 = v_1;
v_uint16x8 v = v_load(src + i);
v_uint32x4 v_b1, v_b2;
v_expand(v, v_b1, v_b2);
int p = power;
while( p > 1 )
{
if (p & 1)
{
v_a1 = vmulq_u32(v_a1, v_b1);
v_a2 = vmulq_u32(v_a2, v_b2);
v_a1 *= v_b1;
v_a2 *= v_b2;
}
v_b1 = vmulq_u32(v_b1, v_b1);
v_b2 = vmulq_u32(v_b2, v_b2);
v_b1 *= v_b1;
v_b2 *= v_b2;
p >>= 1;
}
v_a1 = vmulq_u32(v_a1, v_b1);
v_a2 = vmulq_u32(v_a2, v_b2);
vst1q_u16(dst + i, vcombine_u16(vqmovn_u32(v_a1), vqmovn_u32(v_a2)));
v_a1 *= v_b1;
v_a2 *= v_b2;
v = v_pack(v_a1, v_a2);
v_store(dst + i, v);
}
return i;
......@@ -1005,60 +1014,70 @@ struct iPow_SIMD<short, int>
int operator() ( const short * src, short * dst, int len, int power)
{
int i = 0;
int32x4_t v_1 = vdupq_n_s32(1);
v_int32x4 v_1 = v_setall_s32(1);
for ( ; i <= len - 8; i += 8)
{
int32x4_t v_a1 = v_1, v_a2 = v_1;
int16x8_t v_src = vld1q_s16(src + i);
int32x4_t v_b1 = vmovl_s16(vget_low_s16(v_src)), v_b2 = vmovl_s16(vget_high_s16(v_src));
v_int32x4 v_a1 = v_1, v_a2 = v_1;
v_int16x8 v = v_load(src + i);
v_int32x4 v_b1, v_b2;
v_expand(v, v_b1, v_b2);
int p = power;
while( p > 1 )
{
if (p & 1)
{
v_a1 = vmulq_s32(v_a1, v_b1);
v_a2 = vmulq_s32(v_a2, v_b2);
v_a1 *= v_b1;
v_a2 *= v_b2;
}
v_b1 = vmulq_s32(v_b1, v_b1);
v_b2 = vmulq_s32(v_b2, v_b2);
v_b1 *= v_b1;
v_b2 *= v_b2;
p >>= 1;
}
v_a1 = vmulq_s32(v_a1, v_b1);
v_a2 = vmulq_s32(v_a2, v_b2);
vst1q_s16(dst + i, vcombine_s16(vqmovn_s32(v_a1), vqmovn_s32(v_a2)));
v_a1 *= v_b1;
v_a2 *= v_b2;
v = v_pack(v_a1, v_a2);
v_store(dst + i, v);
}
return i;
}
};
template <>
struct iPow_SIMD<int, int>
{
int operator() ( const int * src, int * dst, int len, int power)
{
int i = 0;
int32x4_t v_1 = vdupq_n_s32(1);
v_int32x4 v_1 = v_setall_s32(1);
for ( ; i <= len - 4; i += 4)
for ( ; i <= len - 8; i += 8)
{
int32x4_t v_b = vld1q_s32(src + i), v_a = v_1;
v_int32x4 v_a1 = v_1, v_a2 = v_1;
v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
int p = power;
while( p > 1 )
{
if (p & 1)
v_a = vmulq_s32(v_a, v_b);
v_b = vmulq_s32(v_b, v_b);
{
v_a1 *= v_b1;
v_a2 *= v_b2;
}
v_b1 *= v_b1;
v_b2 *= v_b2;
p >>= 1;
}
v_a = vmulq_s32(v_a, v_b);
vst1q_s32(dst + i, v_a);
v_a1 *= v_b1;
v_a2 *= v_b2;
v_store(dst + i, v_a1);
v_store(dst + i + 4, v_a2);
}
return i;
......@@ -1071,42 +1090,143 @@ struct iPow_SIMD<float, float>
int operator() ( const float * src, float * dst, int len, int power)
{
int i = 0;
float32x4_t v_1 = vdupq_n_f32(1.0f);
v_float32x4 v_1 = v_setall_f32(1.f);
for ( ; i <= len - 8; i += 8)
{
v_float32x4 v_a1 = v_1, v_a2 = v_1;
v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
int p = std::abs(power);
if( power < 0 )
{
v_b1 = v_1 / v_b1;
v_b2 = v_1 / v_b2;
}
while( p > 1 )
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
}
v_b1 *= v_b1;
v_b2 *= v_b2;
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_store(dst + i, v_a1);
v_store(dst + i + 4, v_a2);
}
return i;
}
};
#if CV_SIMD128_64F
template <>
struct iPow_SIMD<double, double>
{
int operator() ( const double * src, double * dst, int len, int power)
{
int i = 0;
v_float64x2 v_1 = v_setall_f64(1.);
for ( ; i <= len - 4; i += 4)
{
float32x4_t v_b = vld1q_f32(src + i), v_a = v_1;
int p = power;
v_float64x2 v_a1 = v_1, v_a2 = v_1;
v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2);
int p = std::abs(power);
if( power < 0 )
{
v_b1 = v_1 / v_b1;
v_b2 = v_1 / v_b2;
}
while( p > 1 )
{
if (p & 1)
v_a = vmulq_f32(v_a, v_b);
v_b = vmulq_f32(v_b, v_b);
{
v_a1 *= v_b1;
v_a2 *= v_b2;
}
v_b1 *= v_b1;
v_b2 *= v_b2;
p >>= 1;
}
v_a = vmulq_f32(v_a, v_b);
vst1q_f32(dst + i, v_a);
v_a1 *= v_b1;
v_a2 *= v_b2;
v_store(dst + i, v_a1);
v_store(dst + i + 2, v_a2);
}
return i;
}
};
#endif
#endif
template<typename T, typename WT>
static void
iPow_( const T* src, T* dst, int len, int power )
iPow_i( const T* src, T* dst, int len, int power )
{
iPow_SIMD<T, WT> vop;
int i = vop(src, dst, len, power);
if( power < 0 )
{
T tab[5] =
{
power == -1 ? saturate_cast<T>(-1) : 0, (power & 1) ? -1 : 1,
std::numeric_limits<T>::max(), 1, power == -1 ? 1 : 0
};
for( int i = 0; i < len; i++ )
{
T val = src[i];
dst[i] = cv_abs(val) <= 2 ? tab[val + 2] : (T)0;
}
}
else
{
iPow_SIMD<T, WT> vop;
int i = vop(src, dst, len, power);
for( ; i < len; i++ )
{
WT a = 1, b = src[i];
int p = power;
while( p > 1 )
{
if( p & 1 )
a *= b;
b *= b;
p >>= 1;
}
a *= b;
dst[i] = saturate_cast<T>(a);
}
}
}
template<typename T>
static void
iPow_f( const T* src, T* dst, int len, int power0 )
{
iPow_SIMD<T, T> vop;
int i = vop(src, dst, len, power0);
int power = std::abs(power0);
for( ; i < len; i++ )
{
WT a = 1, b = src[i];
T a = 1, b = src[i];
int p = power;
if( power0 < 0 )
b = 1/b;
while( p > 1 )
{
if( p & 1 )
......@@ -1116,44 +1236,43 @@ iPow_( const T* src, T* dst, int len, int power )
}
a *= b;
dst[i] = saturate_cast<T>(a);
dst[i] = a;
}
}
static void iPow8u(const uchar* src, uchar* dst, int len, int power)
{
iPow_<uchar, int>(src, dst, len, power);
iPow_i<uchar, unsigned>(src, dst, len, power);
}
static void iPow8s(const schar* src, schar* dst, int len, int power)
{
iPow_<schar, int>(src, dst, len, power);
iPow_i<schar, int>(src, dst, len, power);
}
static void iPow16u(const ushort* src, ushort* dst, int len, int power)
{
iPow_<ushort, int>(src, dst, len, power);
iPow_i<ushort, unsigned>(src, dst, len, power);
}
static void iPow16s(const short* src, short* dst, int len, int power)
{
iPow_<short, int>(src, dst, len, power);
iPow_i<short, int>(src, dst, len, power);
}
static void iPow32s(const int* src, int* dst, int len, int power)
{
iPow_<int, int>(src, dst, len, power);
iPow_i<int, int>(src, dst, len, power);
}
static void iPow32f(const float* src, float* dst, int len, int power)
{
iPow_<float, float>(src, dst, len, power);
iPow_f<float>(src, dst, len, power);
}
static void iPow64f(const double* src, double* dst, int len, int power)
{
iPow_<double, double>(src, dst, len, power);
iPow_f<double>(src, dst, len, power);
}
......@@ -1176,14 +1295,25 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
bool doubleSupport = d.doubleFPConfig() > 0;
_dst.createSameSize(_src, type);
if (is_ipower && (ipower == 0 || ipower == 1))
if (is_ipower)
{
if (ipower == 0)
{
_dst.setTo(Scalar::all(1));
else if (ipower == 1)
return true;
}
if (ipower == 1)
{
_src.copyTo(_dst);
return true;
return true;
}
if( ipower < 0 )
{
if( depth == CV_32F || depth == CV_64F )
is_ipower = false;
else
return false;
}
}
if (depth == CV_64F && !doubleSupport)
......@@ -1238,15 +1368,6 @@ void pow( InputArray _src, double power, OutputArray _dst )
if( is_ipower && !(ocl::Device::getDefault().isIntel() && useOpenCL && depth != CV_64F))
{
if( ipower < 0 )
{
divide( Scalar::all(1), _src, _dst );
if( ipower == -1 )
return;
ipower = -ipower;
same = true;
}
switch( ipower )
{
case 0:
......@@ -1257,41 +1378,7 @@ void pow( InputArray _src, double power, OutputArray _dst )
_src.copyTo(_dst);
return;
case 2:
#if defined(HAVE_IPP)
CV_IPP_CHECK()
{
if (depth == CV_32F && !same && ( (_src.dims() <= 2 && !ocl::useOpenCL()) ||
(_src.dims() > 2 && _src.isContinuous() && _dst.isContinuous()) ))
{
Mat src = _src.getMat();
_dst.create( src.dims, src.size, type );
Mat dst = _dst.getMat();
Size size = src.size();
int srcstep = (int)src.step, dststep = (int)dst.step, esz = CV_ELEM_SIZE(type);
if (src.isContinuous() && dst.isContinuous())
{
size.width = (int)src.total();
size.height = 1;
srcstep = dststep = (int)src.total() * esz;
}
size.width *= cn;
IppStatus status = ippiSqr_32f_C1R(src.ptr<Ipp32f>(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
if (status >= 0)
{
CV_IMPL_ADD(CV_IMPL_IPP);
return;
}
setIppErrorStatus();
}
}
#endif
if (same)
multiply(_dst, _dst, _dst);
else
multiply(_src, _src, _dst);
multiply(_src, _src, _dst);
return;
}
}
......@@ -1301,15 +1388,9 @@ void pow( InputArray _src, double power, OutputArray _dst )
CV_OCL_RUN(useOpenCL,
ocl_pow(same ? _dst : _src, power, _dst, is_ipower, ipower))
Mat src, dst;
if (same)
src = dst = _dst.getMat();
else
{
src = _src.getMat();
_dst.create( src.dims, src.size, type );
dst = _dst.getMat();
}
Mat src = _src.getMat();
_dst.create( src.dims, src.size, type );
Mat dst = _dst.getMat();
const Mat* arrays[] = {&src, &dst, 0};
uchar* ptrs[2];
......@@ -1335,52 +1416,103 @@ void pow( InputArray _src, double power, OutputArray _dst )
}
else
{
#if defined(HAVE_IPP)
CV_IPP_CHECK()
{
if (src.isContinuous() && dst.isContinuous())
{
IppStatus status = depth == CV_32F ?
ippsPowx_32f_A21(src.ptr<Ipp32f>(), (Ipp32f)power, dst.ptr<Ipp32f>(), (Ipp32s)(src.total() * cn)) :
ippsPowx_64f_A50(src.ptr<Ipp64f>(), power, dst.ptr<Ipp64f>(), (Ipp32s)(src.total() * cn));
if (status >= 0)
{
CV_IMPL_ADD(CV_IMPL_IPP);
return;
}
setIppErrorStatus();
}
}
#endif
int j, k, blockSize = std::min(len, ((BLOCK_SIZE + cn-1)/cn)*cn);
size_t esz1 = src.elemSize1();
AutoBuffer<uchar> buf;
Cv32suf inf32, nan32;
Cv64suf inf64, nan64;
float* fbuf = 0;
double* dbuf = 0;
inf32.i = 0x7f800000;
nan32.i = 0x7fffffff;
inf64.i = CV_BIG_INT(0x7FF0000000000000);
nan64.i = CV_BIG_INT(0x7FFFFFFFFFFFFFFF);
if( src.ptr() == dst.ptr() )
{
buf.allocate(blockSize*esz1);
fbuf = (float*)(uchar*)buf;
dbuf = (double*)(uchar*)buf;
}
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
for( j = 0; j < len; j += blockSize )
{
int bsz = std::min(len - j, blockSize);
#if defined(HAVE_IPP)
CV_IPP_CHECK()
{
IppStatus status = depth == CV_32F ?
ippsPowx_32f_A21((const float*)ptrs[0], (float)power, (float*)ptrs[1], bsz) :
ippsPowx_64f_A50((const double*)ptrs[0], (double)power, (double*)ptrs[1], bsz);
if (status >= 0)
{
CV_IMPL_ADD(CV_IMPL_IPP);
ptrs[0] += bsz*esz1;
ptrs[1] += bsz*esz1;
continue;
}
setIppErrorStatus();
}
#endif
if( depth == CV_32F )
{
const float* x = (const float*)ptrs[0];
float* x0 = (float*)ptrs[0];
float* x = fbuf ? fbuf : x0;
float* y = (float*)ptrs[1];
if( x != x0 )
memcpy(x, x0, bsz*esz1);
Log_32f(x, y, bsz);
for( k = 0; k < bsz; k++ )
y[k] = (float)(y[k]*power);
Exp_32f(y, y, bsz);
for( k = 0; k < bsz; k++ )
{
if( x0[k] <= 0 )
{
if( x0[k] == 0.f )
{
if( power < 0 )
y[k] = inf32.f;
}
else
y[k] = nan32.f;
}
}
}
else
{
const double* x = (const double*)ptrs[0];
double* x0 = (double*)ptrs[0];
double* x = dbuf ? dbuf : x0;
double* y = (double*)ptrs[1];
if( x != x0 )
memcpy(x, x0, bsz*esz1);
Log_64f(x, y, bsz);
for( k = 0; k < bsz; k++ )
y[k] *= power;
Exp_64f(y, y, bsz);
for( k = 0; k < bsz; k++ )
{
if( x0[k] <= 0 )
{
if( x0[k] == 0. )
{
if( power < 0 )
y[k] = inf64.f;
}
else
y[k] = nan64.f;
}
}
}
ptrs[0] += bsz*esz1;
ptrs[1] += bsz*esz1;
......
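
The pow() changes in the hunk above make the special cases explicit on the log/exp path: a zero input raised to a negative power yields +Inf, and a negative input raised to a non-integer power yields NaN (the new Core_Pow.special test further down exercises exactly this). A small usage sketch of the resulting behaviour; the assertions reflect the semantics described here and are not code from the patch:

    #include <opencv2/core.hpp>
    #include <cmath>
    #include <cassert>

    int main()
    {
        cv::Mat src = (cv::Mat_<float>(1, 3) << 0.f, -2.f, 4.f);
        cv::Mat dst;

        cv::pow(src, -0.5, dst);   // non-integer negative power -> log/exp path

        assert(std::isinf(dst.at<float>(0)));                 // 0 ^ -0.5    -> +Inf
        assert(std::isnan(dst.at<float>(1)));                 // (-2) ^ -0.5 -> NaN
        assert(std::fabs(dst.at<float>(2) - 0.5f) < 1e-3f);   // 4 ^ -0.5    -> 0.5
        return 0;
    }
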
......@@ -290,4 +290,6 @@ extern bool __termination; // skip some cleanups, because process is terminating
}
#include "opencv2/hal/intrin.hpp"
#endif /*_CXCORE_INTERNAL_H_*/
......@@ -1210,6 +1210,13 @@ TEST(Core_Mat, copyNx1ToVector)
ASSERT_PRED_FORMAT2(cvtest::MatComparator(0, 0), ref_dst16, cv::Mat_<ushort>(dst16));
}
TEST(Core_Matx, fromMat_)
{
Mat_<double> a = (Mat_<double>(2,2) << 10, 11, 12, 13);
Matx22d b(a);
ASSERT_EQ( norm(a, b, NORM_INF), 0.);
}
TEST(Core_SVD, orthogonality)
{
for( int i = 0; i < 2; i++ )
......
......@@ -232,7 +232,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ )
for( j = 0; j < ncols; j++ )
{
int val = ((uchar*)a_data)[j];
((uchar*)b_data)[j] = (uchar)(val <= 1 ? val :
((uchar*)b_data)[j] = (uchar)(val == 0 ? 255 : val == 1 ? 1 :
val == 2 && ipower == -1 ? 1 : 0);
}
else
......@@ -247,17 +247,17 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ )
if( ipower < 0 )
for( j = 0; j < ncols; j++ )
{
int val = ((char*)a_data)[j];
((char*)b_data)[j] = (char)((val&~1)==0 ? val :
int val = ((schar*)a_data)[j];
((schar*)b_data)[j] = (schar)(val == 0 ? 127 : val == 1 ? 1 :
val ==-1 ? 1-2*(ipower&1) :
val == 2 && ipower == -1 ? 1 : 0);
}
else
for( j = 0; j < ncols; j++ )
{
int val = ((char*)a_data)[j];
int val = ((schar*)a_data)[j];
val = ipow( val, ipower );
((char*)b_data)[j] = saturate_cast<schar>(val);
((schar*)b_data)[j] = saturate_cast<schar>(val);
}
break;
case CV_16U:
......@@ -265,7 +265,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ )
for( j = 0; j < ncols; j++ )
{
int val = ((ushort*)a_data)[j];
((ushort*)b_data)[j] = (ushort)((val&~1)==0 ? val :
((ushort*)b_data)[j] = (ushort)(val == 0 ? 65535 : val == 1 ? 1 :
val ==-1 ? 1-2*(ipower&1) :
val == 2 && ipower == -1 ? 1 : 0);
}
......@@ -282,7 +282,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ )
for( j = 0; j < ncols; j++ )
{
int val = ((short*)a_data)[j];
((short*)b_data)[j] = (short)((val&~1)==0 ? val :
((short*)b_data)[j] = (short)(val == 0 ? 32767 : val == 1 ? 1 :
val ==-1 ? 1-2*(ipower&1) :
val == 2 && ipower == -1 ? 1 : 0);
}
......@@ -299,7 +299,7 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ )
for( j = 0; j < ncols; j++ )
{
int val = ((int*)a_data)[j];
((int*)b_data)[j] = (val&~1)==0 ? val :
((int*)b_data)[j] = val == 0 ? INT_MAX : val == 1 ? 1 :
val ==-1 ? 1-2*(ipower&1) :
val == 2 && ipower == -1 ? 1 : 0;
}
......@@ -351,8 +351,6 @@ void Core_PowTest::prepare_to_validation( int /*test_case_idx*/ )
}
}
///////////////////////////////////////// matrix tests ////////////////////////////////////////////
class Core_MatrixTest : public cvtest::ArrayTest
......@@ -2822,4 +2820,55 @@ TEST(CovariationMatrixVectorOfMatWithMean, accuracy)
ASSERT_EQ(sDiff.dot(sDiff), 0.0);
}
TEST(Core_Pow, special)
{
for( int i = 0; i < 100; i++ )
{
int n = theRNG().uniform(1, 30);
Mat mtx0(1, n, CV_8S), mtx, result;
randu(mtx0, -5, 5);
int type = theRNG().uniform(0, 2) ? CV_64F : CV_32F;
double eps = type == CV_32F ? 1e-3 : 1e-10;
mtx0.convertTo(mtx, type);
// generate power from [-n, n] interval with 1/8 step - enough to check various cases.
const int max_pf = 3;
int pf = theRNG().uniform(0, max_pf*2+1);
double power = ((1 << pf) - (1 << (max_pf*2-1)))/16.;
int ipower = cvRound(power);
bool is_ipower = ipower == power;
cv::pow(mtx, power, result);
for( int j = 0; j < n; j++ )
{
double val = type == CV_32F ? (double)mtx.at<float>(j) : mtx.at<double>(j);
double r = type == CV_32F ? (double)result.at<float>(j) : result.at<double>(j);
double r0;
if( power == 0. )
r0 = 1;
else if( is_ipower )
{
r0 = 1;
for( int k = 0; k < std::abs(ipower); k++ )
r0 *= val;
if( ipower < 0 )
r0 = 1./r0;
}
else
r0 = std::pow(val, power);
if( cvIsInf(r0) )
{
ASSERT_TRUE(cvIsInf(r));
}
else if( cvIsNaN(r0) )
{
ASSERT_TRUE(cvIsNaN(r));
}
else
{
ASSERT_LT(fabs(r - r0), eps);
}
}
}
}
/* End of file. */
......@@ -614,6 +614,16 @@ inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
__m128i d1 = _mm_unpackhi_epi32(c0, c1);
return v_int32x4(_mm_unpacklo_epi64(d0, d1));
}
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
a = a * b;
return a;
}
inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
{
a = a * b;
return a;
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
v_int32x4& c, v_int32x4& d)
......
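
The final hunk (the SSE backend of the universal intrinsics) adds compound-assignment *= for the 32-bit integer vector types, which is what lets the iPow kernels above be written as v_a1 *= v_b1. A minimal sketch of exponentiation by squaring on one such vector, mirroring the iPow_SIMD<int, int> loop from this commit (the helper name ipow_vec_sketch is illustrative only; assumes power >= 1):

    #include "opencv2/hal/intrin.hpp"

    static cv::v_int32x4 ipow_vec_sketch(cv::v_int32x4 b, int power)
    {
        cv::v_int32x4 a = cv::v_setall_s32(1);
        int p = power;
        while (p > 1)
        {
            if (p & 1)
                a *= b;   // operator*= added by this commit
            b *= b;
            p >>= 1;
        }
        return a * b;     // one final multiply, as in the scalar fallback
    }
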