Bit-exact GaussianBlur reworked to use wide intrinsics (#12073)

* Bit-exact GaussianBlur reworked to use wide intrinsics * Added v_mul_hi universal intrinsic * Removed custom SSE2 branch from bit-exact GaussianBlur * Removed loop unrolling for gaussianBlur horizontal smoothing

Bit-exact GaussianBlur reworked to use wide intrinsics (#12073)
* Bit-exact GaussianBlur reworked to use wide intrinsics * Added v_mul_hi universal intrinsic * Removed custom SSE2 branch from bit-exact GaussianBlur * Removed loop unrolling for gaussianBlur horizontal smoothing
0f2b535f · Vitaly Tuzov · Vadim Pisarevsky · e345cb03 · 0f2b535f · 0f2b535f
Commit 0f2b535f authored Aug 31, 2018 by Vitaly Tuzov Committed by Vadim Pisarevsky Aug 31, 2018
6 changed files
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -664,6 +664,8 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
    v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
 }
+inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
+inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
 /** Non-saturating arithmetics **/
 #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \

--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -891,6 +891,20 @@ template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, c
    }
 }
+/** @brief Multiply and extract high part
+Multiply values two registers and store high part of the results.
+Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
+*/
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    v_reg<_Tp, n> c;
+    for (int i = 0; i < n; i++)
+        c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
+    return c;
+}
 //! @cond IGNORED
 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)

--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -553,6 +553,21 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
 }
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int16x8(vcombine_s16(
+                                  vshrn_n_s32(vmull_s16( vget_low_s16(a.val),  vget_low_s16(b.val)), 16),
+                                  vshrn_n_s32(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)), 16)
+                                 ));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(vcombine_u16(
+                                   vshrn_n_u32(vmull_u16( vget_low_u16(a.val),  vget_low_u16(b.val)), 16),
+                                   vshrn_n_u32(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)), 16)
+                                  ));
+}
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));

--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -737,6 +737,9 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
    d.val = _mm_unpackhi_epi64(c0, c1);
 }
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
    return v_int32x4(_mm_madd_epi16(a.val, b.val));

--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -457,6 +457,21 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
 }
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int16x8(vec_packs(
+                               vec_sra(vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)), vec_uint4_sp(16)),
+                               vec_sra(vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)), vec_uint4_sp(16))
+                              ));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(vec_packs(
+                                vec_sr(vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)), vec_uint4_sp(16)),
+                                vec_sr(vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)), vec_uint4_sp(16))
+                               ));
+}
 /** Non-saturating arithmetics **/
 #define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin)    \
 template<typename _Tpvec>                             \

--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp