Merge pull request #14916 from terfendail:wsignmask_deprecated

* Avoid using v_signmask universal intrinsic and mark it as deprecated
* Renamed v_find_negative to v_scan_forward

Merge pull request #14916 from terfendail:wsignmask_deprecated
* Avoid using v_signmask universal intrinsic and mark it as deprecated
* Renamed v_find_negative to v_scan_forward
9befb7a1 · Vitaly Tuzov · Alexander Alekhin · 3e4a195b · 9befb7a1 · 9befb7a1
Commit 9befb7a1 authored Jul 01, 2019 by Vitaly Tuzov Committed by Alexander Alekhin Jul 01, 2019
13 changed files
--- a/modules/calib3d/src/stereobm.cpp
+++ b/modules/calib3d/src/stereobm.cpp
@@ -534,12 +534,12 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                    v_expand(sad8, sad4_l, sad4_h);
                    mask4 = thresh4 > sad4_l;
                    mask4 = mask4 & ((d1 > d4) | (d4 > d2));
-                    if( v_signmask(mask4) )
+                    if( v_check_any(mask4) )
                        break;
                    d4 += dd_4;
                    mask4 = thresh4 > sad4_h;
                    mask4 = mask4 & ((d1 > d4) | (d4 > d2));
-                    if( v_signmask(mask4) )
+                    if( v_check_any(mask4) )
                        break;
                    d4 += dd_4;
                }

--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -2013,14 +2013,14 @@ void SGBM3WayMainLoop::operator () (const Range& range) const
                        mask = cost1 < thresh_reg;
                        mask = mask & ( (cur_d<d1) | (cur_d>d2) );
-                        if( v_signmask(mask) )
+                        if( v_check_any(mask) )
                            break;
                        cur_d = cur_d+eight_reg;
                        mask = cost2 < thresh_reg;
                        mask = mask & ( (cur_d<d1) | (cur_d>d2) );
-                        if( v_signmask(mask) )
+                        if( v_check_any(mask) )
                            break;
                        cur_d = cur_d+eight_reg;

--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -55,6 +55,34 @@
 #define OPENCV_HAL_NOP(a) (a)
 #define OPENCV_HAL_1ST(a, b) (a)
+namespace {
+inline unsigned int trailingZeros32(unsigned int value) {
+#if defined(_MSC_VER)
+#if (_MSC_VER < 1700) || defined(_M_ARM)
+    unsigned long index = 0;
+    _BitScanForward(&index, value);
+    return (unsigned int)index;
+#elif defined(__clang__)
+    // clang-cl doesn't export _tzcnt_u32 for non BMI systems
+    return value ? __builtin_ctz(value) : 32;
+#else
+    return _tzcnt_u32(value);
+#endif
+#elif defined(__GNUC__) || defined(__GNUG__)
+    return __builtin_ctz(value);
+#elif defined(__ICC) || defined(__INTEL_COMPILER)
+    return _bit_scan_forward(value);
+#elif defined(__clang__)
+    return llvm.cttz.i32(value, true);
+#else
+    static const int MultiplyDeBruijnBitPosition[32] = {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
+    return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
+#endif
+}
+}
 // unlike HAL API, which is in cv::hal,
 // we put intrinsics into cv namespace to make its
 // access from within opencv code more accessible
@@ -419,32 +447,6 @@ namespace CV__SIMD_NAMESPACE {
 using namespace CV__SIMD_NAMESPACE;
 #endif
-inline unsigned int trailingZeros32(unsigned int value) {
-#if defined(_MSC_VER)
-#if (_MSC_VER < 1700) || defined(_M_ARM)
-    unsigned long index = 0;
-    _BitScanForward(&index, value);
-    return (unsigned int)index;
-#elif defined(__clang__)
-    // clang-cl doesn't export _tzcnt_u32 for non BMI systems
-    return value ? __builtin_ctz(value) : 32;
-#else
-    return _tzcnt_u32(value);
-#endif
-#elif defined(__GNUC__) || defined(__GNUG__)
-    return __builtin_ctz(value);
-#elif defined(__ICC) || defined(__INTEL_COMPILER)
-    return _bit_scan_forward(value);
-#elif defined(__clang__)
-    return llvm.cttz.i32(value, true);
-#else
-    static const int MultiplyDeBruijnBitPosition[32] = {
-        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
-        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
-    return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
-#endif
-}
 #ifndef CV_DOXYGEN
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #endif

--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1244,6 +1244,17 @@ inline int v_signmask(const v_float32x8& a)
 inline int v_signmask(const v_float64x4& a)
 { return _mm256_movemask_pd(a.val); }
+inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
 /** Checks **/
 #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask)  \
    inline bool v_check_all(const _Tpvec& a)                \

--- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
@@ -2719,7 +2719,7 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8
 ////////// Mask and checks /////////
 /** Mask **/
-inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
+inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
 inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
 inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
 inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
@@ -2733,7 +2733,7 @@ inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as
 /** Checks **/
 inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
-inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
+inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
 inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
 inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
 inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
@@ -2754,6 +2754,22 @@ inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret
 inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
 inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
+inline int v_scan_forward(const v_int8x64& a)
+{
+    int64 mask = _mm512_movepi8_mask(a.val);
+    int mask32 = (int)mask;
+    return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
+}
+inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
+inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
+inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
+inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
+inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
+inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
+inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
+inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
+inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
 inline void v512_cleanup() { _mm256_zeroall(); }
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -1072,6 +1072,7 @@ template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTrait
 }
 /** @brief Get negative values mask
+@deprecated v_signmask depends heavily on the lane count and therefore isn't universal enough
 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
 Example:
@@ -1088,6 +1089,23 @@ template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
    return mask;
 }
+/** @brief Get first negative lane index
+Returned value is the index of the first negative lane (undefined if the input has no negative lanes)
+Example:
+@code{.cpp}
+v_int32x4 r; // set to {0, 0, -1, -1}
+int idx = v_scan_forward(r); // idx = 2
+@endcode
+*/
+template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
+{
+    for (int i = 0; i < n; i++)
+        if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0)
+            return i;
+    return 0;
+}
 /** @brief Check if all packed values are less than zero
 Unsigned values will be casted to signed: `uchar 254 => char -2`.

--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1096,17 +1096,32 @@ inline int v_signmask(const v_int32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
 inline int v_signmask(const v_float32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
-#if CV_SIMD128_64F
 inline int v_signmask(const v_uint64x2& a)
 {
    int64x1_t m0 = vdup_n_s64(0);
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
 }
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
 inline int v_signmask(const v_float64x2& a)
 { return v_signmask(v_reinterpret_as_u64(a)); }
 #endif
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+#if CV_SIMD128_64F
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
+#endif
 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
 inline bool v_check_all(const v_##_Tpvec& a) \
 { \

--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1617,6 +1617,17 @@ OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND,
 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
 #if CV_SSE4_1
 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \

--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -891,6 +891,17 @@ inline int v_signmask(const v_uint64x2& a)
 inline int v_signmask(const v_float64x2& a)
 { return v_signmask(v_reinterpret_as_s64(a)); }
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
 template<typename _Tpvec>
 inline bool v_check_all(const _Tpvec& a)
 { return vec_all_lt(a.val, _Tpvec().val); }

--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -132,10 +132,9 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
                            m1 = m1 | ((x3 < v1) & (x0 < v1));
                            m0 = m0 | m1;
-                            int mask = v_signmask(m0);
+                            if( !v_check_any(m0) )
-                            if( mask == 0 )
                                continue;
-                            if( (mask & 255) == 0 )
+                            if( !v_check_any(v_combine_low(m0, m0)) )
                            {
                                j -= 8;
                                ptr -= 8;
@@ -159,16 +158,36 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
                                max1 = v_max(max1, v_reinterpret_as_u8(c1));
                            }
-                            max0 = v_max(max0, max1);
+                            max0 = K16 < v_max(max0, max1);
-                            int m = v_signmask(K16 < max0);
+                            int m = -v_reduce_sum(v_reinterpret_as_s8(max0));
+                            uchar mflag[16];
+                            v_store(mflag, max0);
-                            for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
+                            for( k = 0; m > 0 && k < 16; k++ )
                            {
-                                if(m & 1)
+                                if(mflag[k])
                                {
+                                    --m;
                                    cornerpos[ncorners++] = j+k;
                                    if(nonmax_suppression)
-                                        curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                                    {
+                                        short d[25];
+                                        for (int _k = 0; _k < 25; _k++)
+                                            d[_k] = (short)(ptr[k] - ptr[k + pixel[_k]]);
+                                        v_int16x8 a0, b0, a1, b1;
+                                        a0 = b0 = a1 = b1 = v_load(d + 8);
+                                        for(int shift = 0; shift < 8; ++shift)
+                                        {
+                                            v_int16x8 v_nms = v_load(d + shift);
+                                            a0 = v_min(a0, v_nms);
+                                            b0 = v_max(b0, v_nms);
+                                            v_nms = v_load(d + 9 + shift);
+                                            a1 = v_min(a1, v_nms);
+                                            b1 = v_max(b1, v_nms);
+                                        }
+                                        curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1);
+                                    }
                                }
                            }
                        }

--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
--- a/modules/imgproc/src/contours.cpp
+++ b/modules/imgproc/src/contours.cpp
@@ -1061,19 +1061,13 @@ cvFindNextContour( CvContourScanner scanner )
                }
                else
                {
-#if CV_SIMD_WIDTH > 16
+                    v_uint8 v_prev = vx_setall_u8((uchar)prev);
-                    v_uint8 vx_prev = vx_setall_u8((uchar)prev);
+                    for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
-                    while (x <= width - v_uint8::nlanes &&
-                           v_check_all(vx_load((uchar*)(img + x)) == vx_prev))
-                        x += v_uint8::nlanes;
-#endif
-                    v_uint8x16 v_prev = v_setall_u8((uchar)prev);
-                    for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes)
                    {
-                        unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev);
+                        v_uint8 vmask = (vx_load((uchar*)(img + x)) != v_prev);
-                        if (mask)
+                        if (v_check_any(vmask))
                        {
-                            p = img[(x += cv::trailingZeros32(mask))];
+                            p = img[(x += v_scan_forward(vmask))];
                            goto _next_contour;
                        }
                    }
@@ -1334,19 +1328,13 @@ CvLinkedRunPoint;
 inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j)
 {
 #if CV_SIMD
-#if CV_SIMD_WIDTH > 16
+    v_uint8 v_zero = vx_setzero_u8();
-    v_uint8 vx_zero = vx_setzero_u8();
+    for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
-    while (j <= img_size.width - v_uint8::nlanes &&
-           v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero))
-        j += v_uint8::nlanes;
-#endif
-    v_uint8x16 v_zero = v_setzero_u8();
-    for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes)
    {
-        unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero);
+        v_uint8 vmask = (vx_load((uchar*)(src_data + j)) != v_zero);
-        if (mask)
+        if (v_check_any(vmask))
        {
-            j += cv::trailingZeros32(mask);
+            j += v_scan_forward(vmask);
            return j;
        }
    }
@@ -1365,19 +1353,13 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j)
    }
    else
    {
-#if CV_SIMD_WIDTH > 16
+        v_uint8 v_zero = vx_setzero_u8();
-        v_uint8 vx_zero = vx_setzero_u8();
-        while (j <= img_size.width - v_uint8::nlanes &&
-               v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero))
-            j += v_uint8::nlanes;
-#endif
-        v_uint8x16 v_zero = v_setzero_u8();
        for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
        {
-            unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero);
+            v_uint8 vmask = (vx_load((uchar*)(src_data + j)) == v_zero);
-            if (mask)
+            if (v_check_any(vmask))
            {
-                j += cv::trailingZeros32(mask);
+                j += v_scan_forward(vmask);
                return j;
            }
        }

--- a/modules/imgproc/src/hough.cpp
+++ b/modules/imgproc/src/hough.cpp
@@ -1139,32 +1139,23 @@ public:
            for(; x < numCols; ++x )
            {
-#if CV_SIMD128
+#if CV_SIMD
                {
-                    v_uint8x16 v_zero = v_setzero_u8();
+                    v_uint8 v_zero = vx_setzero_u8();
-                    for(; x <= numCols - 32; x += 32) {
+                    for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) {
-                        v_uint8x16 v_edge1 = v_load(edgeData + x);
+                        v_uint8 v_edge1 = (vx_load(edgeData + x                  ) != v_zero);
-                        v_uint8x16 v_edge2 = v_load(edgeData + x + 16);
+                        v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero);
-                        v_uint8x16 v_cmp1 = (v_edge1 == v_zero);
+                        if(v_check_any(v_edge1))
-                        v_uint8x16 v_cmp2 = (v_edge2 == v_zero);
-                        unsigned int mask1 = v_signmask(v_cmp1);
-                        unsigned int mask2 = v_signmask(v_cmp2);
-                        mask1 ^= 0x0000ffff;
-                        mask2 ^= 0x0000ffff;
-                        if(mask1)
                        {
-                            x += trailingZeros32(mask1);
+                            x += v_scan_forward(v_edge1);
                            goto _next_step;
                        }
-                        if(mask2)
+                        if(v_check_any(v_edge2))
                        {
-                            x += trailingZeros32(mask2 << 16);
+                            x += v_uint8::nlanes + v_scan_forward(v_edge2);
                            goto _next_step;
                        }
                    }
@@ -1175,7 +1166,7 @@ public:
                if(x == numCols)
                    continue;
-#if CV_SIMD128
+#if CV_SIMD
 _next_step:
 #endif
                float vx, vy;
@@ -1506,36 +1497,35 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
    int nzCount = 0;
    const Point* nz_ = &nz[0];
    int j = 0;
-#if CV_SIMD128
+#if CV_SIMD
    {
-        const v_float32x4 v_minRadius2 = v_setall_f32(minRadius2);
+        const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
-        const v_float32x4 v_maxRadius2 = v_setall_f32(maxRadius2);
+        const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
-        v_float32x4 v_curCenterX = v_setall_f32(curCenter.x);
+        v_float32 v_curCenterX = vx_setall_f32(curCenter.x);
-        v_float32x4 v_curCenterY = v_setall_f32(curCenter.y);
+        v_float32 v_curCenterY = vx_setall_f32(curCenter.y);
-        float CV_DECL_ALIGNED(16) rbuf[4];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
-        for(; j <= nzSz - 4; j += 4)
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
+        for(; j <= nzSz - v_float32::nlanes; j += v_float32::nlanes)
        {
-            v_float32x4 v_nzX, v_nzY;
+            v_float32 v_nzX, v_nzY;
            v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype
-            v_float32x4 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX));
+            v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX));
-            v_float32x4 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY));
+            v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY));
-            v_float32x4 v_dx = v_x - v_curCenterX;
+            v_float32 v_dx = v_x - v_curCenterX;
-            v_float32x4 v_dy = v_y - v_curCenterY;
+            v_float32 v_dy = v_y - v_curCenterY;
-            v_float32x4 v_r2 = (v_dx * v_dx) + (v_dy * v_dy);
+            v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy);
-            v_float32x4 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2);
+            v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2);
-            unsigned int mask = v_signmask(vmask);
+            if (v_check_any(vmask))
-            if (mask)
            {
+                v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
                v_store_aligned(rbuf, v_r2);
-                if (mask & 1) ddata[nzCount++] = rbuf[0];
+                for (int i = 0; i < v_int32::nlanes; ++i)
-                if (mask & 2) ddata[nzCount++] = rbuf[1];
+                    if (rmask[i]) ddata[nzCount++] = rbuf[i];
-                if (mask & 4) ddata[nzCount++] = rbuf[2];
-                if (mask & 8) ddata[nzCount++] = rbuf[3];
            }
        }
    }
@@ -1566,12 +1556,13 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
    const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols));
    const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows));
-#if CV_SIMD128
+#if CV_SIMD
-    const int numSIMDPoints = 4;
+    float v_seq[v_float32::nlanes];
+    for (int i = 0; i < v_float32::nlanes; ++i)
-    const v_float32x4 v_minRadius2 = v_setall_f32(minRadius2);
+        v_seq[i] = (float)i;
-    const v_float32x4 v_maxRadius2 = v_setall_f32(maxRadius2);
+    const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
-    const v_float32x4 v_curCenterX_0123 = v_setall_f32(curCenter.x) - v_float32x4(0.0f, 1.0f, 2.0f, 3.0f);
+    const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
+    const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq);
 #endif
    for (int y = yOuter.start; y < yOuter.end; y++)
@@ -1581,29 +1572,28 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
        float dy2 = dy * dy;
        int x = xOuter.start;
-#if CV_SIMD128
+#if CV_SIMD
        {
-            const v_float32x4 v_dy2 = v_setall_f32(dy2);
+            const v_float32 v_dy2 = vx_setall_f32(dy2);
-            const v_uint32x4 v_zero_u32 = v_setall_u32(0);
+            const v_uint32 v_zero_u32 = vx_setall_u32(0);
-            float CV_DECL_ALIGNED(16) rbuf[4];
+            float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
-            for (; x <= xOuter.end - 4; x += numSIMDPoints)
+            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
+            for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes)
            {
-                v_uint32x4 v_mask = v_load_expand_q(ptr + x);
+                v_uint32 v_mask = vx_load_expand_q(ptr + x);
                v_mask = v_mask != v_zero_u32;
-                v_float32x4 v_x = v_cvt_f32(v_setall_s32(x));
+                v_float32 v_x = v_cvt_f32(vx_setall_s32(x));
-                v_float32x4 v_dx = v_x - v_curCenterX_0123;
+                v_float32 v_dx = v_x - v_curCenterX_0123;
-                v_float32x4 v_r2 = (v_dx * v_dx) + v_dy2;
+                v_float32 v_r2 = (v_dx * v_dx) + v_dy2;
-                v_float32x4 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask);
+                v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask);
-                unsigned int mask = v_signmask(vmask);
+                if (v_check_any(vmask))
-                if (mask)
                {
+                    v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
                    v_store_aligned(rbuf, v_r2);
-                    if (mask & 1) ddata[nzCount++] = rbuf[0];
+                    for (int i = 0; i < v_int32::nlanes; ++i)
-                    if (mask & 2) ddata[nzCount++] = rbuf[1];
+                        if (rmask[i]) ddata[nzCount++] = rbuf[i];
-                    if (mask & 4) ddata[nzCount++] = rbuf[2];
-                    if (mask & 8) ddata[nzCount++] = rbuf[3];
                }
            }
        }