Commit 93ffebc2 authored by Sayed Adel's avatar Sayed Adel

core: reimplement SIMD arithmetic, logic and comparison operations into wide universal intrinsics

  - initialize arithmetic dispatcher
  - add new universal intrinsic v_absdiffs
  - add new universal intrinsic v_pack_b
  - add accumulate version of universal intrinsic v_round
  - fix sse/avx2:uint8 multiplication overflow
  - reimplement arithmetic, logic and comparison operations into wide universal intrinsics
    with full support for all types
  - reimplement IPP arithmetic, logic and comparison operations in a sperate file arithm_ipp.hpp
  - avoid scalar multiplication if scaling factor eq 1 and use integer multiplication
  - move C arithmetic operations to precomp.hpp and delete [arithm_simd|arithm_core].hpp
  - add compatibility with new opencv4 divide policy
parent d61ad04f
......@@ -2,6 +2,7 @@ set(the_description "The Core Functionality")
ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
ocv_add_dispatched_file(stat SSE4_2 AVX2)
ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2)
# dispatching for accuracy tests
ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)
......
......@@ -661,7 +661,7 @@ inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
v_uint16x16 c, d;
v_mul_expand(a, b, c, d);
return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
return v_pack(c, d);
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
......@@ -1291,6 +1291,16 @@ inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(a - b); }
/** Saturating absolute difference **/
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
v_int8x32 d = a - b;
v_int8x32 m = a < b;
return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_max(a, b) - v_min(a, b); }
////////// Conversions /////////
/** Rounding **/
......@@ -1300,6 +1310,12 @@ inline v_int32x8 v_round(const v_float32x8& a)
inline v_int32x8 v_round(const v_float64x4& a)
{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
{
__m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
return v_int32x8(_v256_combine(ai, bi));
}
inline v_int32x8 v_trunc(const v_float32x8& a)
{ return v_int32x8(_mm256_cvttps_epi32(a.val)); }
......@@ -1689,6 +1705,40 @@ void v_rshr_pack_store(int* ptr, const v_int64x4& a)
v_pack_store(ptr, (a + delta) >> n);
}
// pack boolean
inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i ab = _mm256_packs_epi16(a.val, b.val);
return v_uint8x32(_v256_shuffle_odd_64(ab));
}
inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
const v_uint32x8& c, const v_uint32x8& d)
{
__m256i ab = _mm256_packs_epi32(a.val, b.val);
__m256i cd = _mm256_packs_epi32(c.val, d.val);
__m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd));
return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)));
}
inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
const v_uint64x4& g, const v_uint64x4& h)
{
__m256i ab = _mm256_packs_epi32(a.val, b.val);
__m256i cd = _mm256_packs_epi32(c.val, d.val);
__m256i ef = _mm256_packs_epi32(e.val, f.val);
__m256i gh = _mm256_packs_epi32(g.val, h.val);
__m256i abcd = _mm256_packs_epi32(ab, cd);
__m256i efgh = _mm256_packs_epi32(ef, gh);
__m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));
__m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
}
/* Recombine */
// its up there with load and store operations
......
......@@ -109,7 +109,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
- Extract: @ref v_extract
......@@ -159,7 +159,7 @@ Most of these operations return only one value.
### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
- Absolute values: @ref v_abs, @ref v_absdiff
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
### Conversions
......@@ -199,10 +199,12 @@ Regular integers:
|logical | x | x | x | x | x | x |
|min, max | x | x | x | x | x | x |
|absdiff | x | x | x | x | x | x |
|absdiffs | | x | | x | | |
|reduce | | | | | x | x |
|mask | x | x | x | x | x | x |
|pack | x | x | x | x | x | x |
|pack_u | x | | x | | | |
|pack_b | x | | | | | |
|unpack | x | x | x | x | x | x |
|extract | x | x | x | x | x | x |
|rotate (lanes) | x | x | x | x | x | x |
......@@ -762,6 +764,19 @@ inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
return c;
}
/** @brief Saturating absolute difference
Returns \f$ saturate(|a - b|) \f$ .
For 8-, 16-bit signed integer source types. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++)
c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
return c;
}
/** @brief Inversed square root
Returns \f$ 1/sqrt(a) \f$
......@@ -1613,6 +1628,18 @@ template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
return c;
}
/** @overload */
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
v_reg<int, n*2> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = cvRound(a.s[i]);
c.s[i+n] = cvRound(b.s[i]);
}
return c;
}
/** @brief Floor
Floor each value. Input type is float vector ==> output type is int vector.*/
......@@ -2059,6 +2086,103 @@ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, s
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
//! @}
//! @cond IGNORED
template<typename _Tpm, typename _Tp, int n>
inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
for (int i = 0; i < n; ++i)
{
mptr[i] = (_Tpm)a.s[i];
mptr[i + n] = (_Tpm)b.s[i];
}
}
//! @endcond
//! @name Pack boolean values
//! @{
//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
//!
//! @note Must provide valid boolean values to guarantee same result for all architectures.
/** @brief
//! For 16-bit boolean values
Scheme:
@code
a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
===============
{
0xFF 0 0 0xFF 0 0xFF 0xFF 0
0xFF 0 0xFF 0 0 0xFF 0 0xFF
}
@endcode */
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint8x16 mask;
_pack_b(mask.s, a, b);
return mask;
}
/** @overload
For 32-bit boolean values
Scheme:
@code
a {0xFFFF.. 0 0 0xFFFF..}
b {0 0xFFFF.. 0xFFFF.. 0}
c {0xFFFF.. 0 0xFFFF.. 0}
d {0 0xFFFF.. 0 0xFFFF..}
===============
{
0xFF 0 0 0xFF 0 0xFF 0xFF 0
0xFF 0 0xFF 0 0 0xFF 0 0xFF
}
@endcode */
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c, const v_uint32x4& d)
{
v_uint8x16 mask;
_pack_b(mask.s, a, b);
_pack_b(mask.s + 8, c, d);
return mask;
}
/** @overload
For 64-bit boolean values
Scheme:
@code
a {0xFFFF.. 0}
b {0 0xFFFF..}
c {0xFFFF.. 0}
d {0 0xFFFF..}
e {0xFFFF.. 0}
f {0xFFFF.. 0}
g {0 0xFFFF..}
h {0 0xFFFF..}
===============
{
0xFF 0 0 0xFF 0xFF 0 0 0xFF
0xFF 0 0xFF 0 0 0xFF 0 0xFF
}
@endcode */
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
const v_uint64x2& g, const v_uint64x2& h)
{
v_uint8x16 mask;
_pack_b(mask.s, a, b);
_pack_b(mask.s + 4, c, d);
_pack_b(mask.s + 8, e, f);
_pack_b(mask.s + 12, g, h);
return mask;
}
//! @}
/** @brief Matrix multiplication
Scheme:
......
......@@ -394,6 +394,35 @@ OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
return v_uint8x16(ab);
}
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c, const v_uint32x4& d)
{
uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
}
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
const v_uint64x2& g, const v_uint64x2& h)
{
uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
}
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
......@@ -748,7 +777,6 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
......@@ -757,6 +785,12 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
#endif
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
......@@ -1242,6 +1276,11 @@ inline v_int32x4 v_round(const v_float64x2& a)
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
}
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
}
inline v_int32x4 v_floor(const v_float64x2& a)
{
static const int32x2_t zero = vdup_n_s32(0);
......
......@@ -634,6 +634,35 @@ void v_rshr_pack_store(int* ptr, const v_int64x2& a)
_mm_storel_epi64((__m128i*)ptr, a2);
}
// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
__m128i ab = _mm_packs_epi16(a.val, b.val);
return v_uint8x16(ab);
}
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c, const v_uint32x4& d)
{
__m128i ab = _mm_packs_epi32(a.val, b.val);
__m128i cd = _mm_packs_epi32(c.val, d.val);
return v_uint8x16(_mm_packs_epi16(ab, cd));
}
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
const v_uint64x2& g, const v_uint64x2& h)
{
__m128i ab = _mm_packs_epi32(a.val, b.val);
__m128i cd = _mm_packs_epi32(c.val, d.val);
__m128i ef = _mm_packs_epi32(e.val, f.val);
__m128i gh = _mm_packs_epi32(g.val, h.val);
__m128i abcd = _mm_packs_epi32(ab, cd);
__m128i efgh = _mm_packs_epi32(ef, gh);
return v_uint8x16(_mm_packs_epi16(abcd, efgh));
}
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
......@@ -706,19 +735,11 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4)
inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
{
v_uint16x8 c, d;
v_mul_expand(a, b, c, d);
return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
}
inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
{ a = a * b; return a; }
// Multiply and expand
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
v_uint16x8& c, v_uint16x8& d)
......@@ -1045,34 +1066,43 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
{ \
return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
} \
inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
{ \
__m128i smask = _mm_set1_epi32(smask32); \
__m128i a1 = _mm_xor_si128(a.val, smask); \
__m128i b1 = _mm_xor_si128(b.val, smask); \
return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
}
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
/** Absolute difference **/
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
return v_max(a, b) - v_min(a, b);
v_int8x16 d = v_sub_wrap(a, b);
v_int8x16 m = a < b;
return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
__m128i d = _mm_sub_epi32(a.val, b.val);
__m128i m = _mm_cmpgt_epi32(b.val, a.val);
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
v_int32x4 d = a - b;
v_int32x4 m = a < b;
return v_reinterpret_as_u32((d ^ m) - m);
}
/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
v_int8x16 d = a - b;
v_int8x16 m = a < b;
return (d ^ m) - m;
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
return a * b + c;
......@@ -1623,6 +1653,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a)
inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
__m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
return v_int32x4(_mm_unpacklo_epi64(ai, bi));
}
inline v_int32x4 v_floor(const v_float64x2& a)
{
__m128i a1 = _mm_cvtpd_epi32(a.val);
......
......@@ -383,6 +383,35 @@ OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
//OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
// vec_sra, vec_packsu, vec_add, pack_u)
// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
vec_uchar16 ab = vec_pack(a.val, b.val);
return v_uint8x16(ab);
}
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c, const v_uint32x4& d)
{
vec_ushort8 ab = vec_pack(a.val, b.val);
vec_ushort8 cd = vec_pack(c.val, d.val);
return v_uint8x16(vec_pack(ab, cd));
}
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
const v_uint64x2& g, const v_uint64x2& h)
{
vec_uint4 ab = vec_pack(a.val, b.val);
vec_uint4 cd = vec_pack(c.val, d.val);
vec_uint4 ef = vec_pack(e.val, f.val);
vec_uint4 gh = vec_pack(g.val, h.val);
vec_ushort8 abcd = vec_pack(ab, cd);
vec_ushort8 efgh = vec_pack(ef, gh);
return v_uint8x16(vec_pack(abcd, efgh));
}
/* Recombine */
template <typename _Tpvec>
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
......@@ -834,16 +863,27 @@ inline v_float32x4 v_abs(const v_float32x4& x)
inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(vec_abs(x.val)); }
/** Absolute difference **/
// unsigned
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec2(cast(intrin(a.val, b.val))); }
inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd)
OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd)
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }
/** Absolute difference for signed integers **/
inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); }
////////// Conversions /////////
......@@ -854,6 +894,9 @@ inline v_int32x4 v_round(const v_float32x4& a)
inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); }
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); }
inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val))); }
......
This diff is collapsed.
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "arithm_ipp.hpp"
#include "arithm.simd.hpp"
#include "arithm.simd_declarations.hpp"
#define ARITHM_DISPATCHING_ONLY
#include "arithm.simd.hpp"
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -86,7 +86,6 @@
#include "opencv2/core/sse_utils.hpp"
#include "opencv2/core/neon_utils.hpp"
#include "opencv2/core/vsx_utils.hpp"
#include "arithm_core.hpp"
#include "hal_replacement.hpp"
#ifdef HAVE_TEGRA_OPTIMIZATION
......@@ -110,6 +109,102 @@ extern const uchar g_Saturate8u[];
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
};
template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
};
template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
};
template<typename T> struct OpMin
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator ()(const T a, const T b) const { return std::min(a, b); }
};
template<typename T> struct OpMax
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator ()(const T a, const T b) const { return std::max(a, b); }
};
template<typename T> struct OpAbsDiff
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()(T a, T b) const { return a > b ? a - b : b - a; }
};
// specializations to prevent "-0" results
template<> struct OpAbsDiff<float>
{
typedef float type1;
typedef float type2;
typedef float rtype;
float operator()(float a, float b) const { return std::abs(a - b); }
};
template<> struct OpAbsDiff<double>
{
typedef double type1;
typedef double type2;
typedef double rtype;
double operator()(double a, double b) const { return std::abs(a - b); }
};
template<typename T> struct OpAnd
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a & b; }
};
template<typename T> struct OpOr
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a | b; }
};
template<typename T> struct OpXor
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a ^ b; }
};
template<typename T> struct OpNot
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T ) const { return ~a; }
};
template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
......
......@@ -119,11 +119,15 @@ template <typename R> struct Data
d[i] += (LaneType)m;
return *this;
}
void fill(LaneType val)
void fill(LaneType val, int s, int c = R::nlanes)
{
for (int i = 0; i < R::nlanes; ++i)
for (int i = s; i < c; ++i)
d[i] = val;
}
void fill(LaneType val)
{
fill(val, 0);
}
void reverse()
{
for (int i = 0; i < R::nlanes / 2; ++i)
......@@ -739,6 +743,23 @@ template<typename R> struct TheTest
return *this;
}
TheTest & test_absdiffs()
{
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
dataA[0] = (LaneType)-1;
dataB[0] = 1;
dataA[1] = 2;
dataB[1] = (LaneType)-2;
R a = dataA, b = dataB;
Data<R> resC = v_absdiffs(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(saturate_cast<LaneType>(std::abs(dataA[i] - dataB[i])), resC[i]);
}
return *this;
}
TheTest & test_reduce()
{
Data<R> dataA;
......@@ -874,6 +895,81 @@ template<typename R> struct TheTest
return *this;
}
// v_uint8 only
TheTest & test_pack_b()
{
// 16-bit
Data<R> dataA, dataB;
dataB.fill(0, R::nlanes / 2);
R a = dataA, b = dataB;
Data<R> maskA = a == b, maskB = a != b;
a = maskA; b = maskB;
Data<R> res = v_pack_b(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b));
for (int i = 0; i < v_uint16::nlanes; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(maskA[i * 2], res[i]);
EXPECT_EQ(maskB[i * 2], res[i + v_uint16::nlanes]);
}
// 32-bit
Data<R> dataC, dataD;
dataD.fill(0, R::nlanes / 2);
R c = dataC, d = dataD;
Data<R> maskC = c == d, maskD = c != d;
c = maskC; d = maskD;
res = v_pack_b
(
v_reinterpret_as_u32(a), v_reinterpret_as_u32(b),
v_reinterpret_as_u32(c), v_reinterpret_as_u32(d)
);
for (int i = 0; i < v_uint32::nlanes; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(maskA[i * 4], res[i]);
EXPECT_EQ(maskB[i * 4], res[i + v_uint32::nlanes]);
EXPECT_EQ(maskC[i * 4], res[i + v_uint32::nlanes * 2]);
EXPECT_EQ(maskD[i * 4], res[i + v_uint32::nlanes * 3]);
}
// 64-bit
Data<R> dataE, dataF, dataG(0), dataH(0xFF);
dataF.fill(0, R::nlanes / 2);
R e = dataE, f = dataF, g = dataG, h = dataH;
Data<R> maskE = e == f, maskF = e != f;
e = maskE; f = maskF;
res = v_pack_b
(
v_reinterpret_as_u64(a), v_reinterpret_as_u64(b),
v_reinterpret_as_u64(c), v_reinterpret_as_u64(d),
v_reinterpret_as_u64(e), v_reinterpret_as_u64(f),
v_reinterpret_as_u64(g), v_reinterpret_as_u64(h)
);
for (int i = 0; i < v_uint64::nlanes; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(maskA[i * 8], res[i]);
EXPECT_EQ(maskB[i * 8], res[i + v_uint64::nlanes]);
EXPECT_EQ(maskC[i * 8], res[i + v_uint64::nlanes * 2]);
EXPECT_EQ(maskD[i * 8], res[i + v_uint64::nlanes * 3]);
EXPECT_EQ(maskE[i * 8], res[i + v_uint64::nlanes * 4]);
EXPECT_EQ(maskF[i * 8], res[i + v_uint64::nlanes * 5]);
EXPECT_EQ(dataG[i * 8], res[i + v_uint64::nlanes * 6]);
EXPECT_EQ(dataH[i * 8], res[i + v_uint64::nlanes * 7]);
}
return *this;
}
TheTest & test_unpack()
{
Data<R> dataA, dataB;
......@@ -1228,6 +1324,7 @@ void test_hal_intrin_uint8()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_pack_b()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
......@@ -1259,6 +1356,7 @@ void test_hal_intrin_int8()
.test_logic()
.test_min_max()
.test_absdiff()
.test_absdiffs()
.test_abs()
.test_mask()
.test_popcount()
......@@ -1317,6 +1415,7 @@ void test_hal_intrin_int16()
.test_logic()
.test_min_max()
.test_absdiff()
.test_absdiffs()
.test_abs()
.test_reduce()
.test_mask()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment