Commit 0f2b535f authored by Vitaly Tuzov's avatar Vitaly Tuzov Committed by Vadim Pisarevsky

Bit-exact GaussianBlur reworked to use wide intrinsics (#12073)

* Bit-exact GaussianBlur reworked to use wide intrinsics

* Added v_mul_hi universal intrinsic

* Removed custom SSE2 branch from bit-exact GaussianBlur

* Removed loop unrolling for gaussianBlur horizontal smoothing
parent e345cb03
......@@ -664,6 +664,8 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
}
inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
......
......@@ -891,6 +891,20 @@ template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, c
}
}
/** @brief Multiply and extract high part
Multiply values two registers and store high part of the results.
Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
*/
template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<_Tp, n> c;
for (int i = 0; i < n; i++)
c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
return c;
}
//! @cond IGNORED
template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
......
......@@ -553,6 +553,21 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
return v_int16x8(vcombine_s16(
vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16),
vshrn_n_s32(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)), 16)
));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
return v_uint16x8(vcombine_u16(
vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16),
vshrn_n_u32(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)), 16)
));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
......
......@@ -737,6 +737,9 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
d.val = _mm_unpackhi_epi64(c0, c1);
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
return v_int32x4(_mm_madd_epi16(a.val, b.val));
......
......@@ -457,6 +457,21 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c
d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
return v_int16x8(vec_packs(
vec_sra(vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)), vec_uint4_sp(16)),
vec_sra(vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)), vec_uint4_sp(16))
));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
return v_uint16x8(vec_packs(
vec_sr(vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)), vec_uint4_sp(16)),
vec_sr(vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)), vec_uint4_sp(16))
));
}
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) \
template<typename _Tpvec> \
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment