Merge pull request #15488 from ChipKerchner:vectorizeMinMax2

Vectorize minMaxIdx functions * Updated documentation and intrinsic tests for v_reduce * Add other files back in from the forced push * Prevent a constant overflow with v_reduce for int8 type * Another alternative to fix constant overflow warning. * Fix another compiler warning. * Update comments and change comparison form to be consistent with other vectorized loops. * Change return type of v_reduce_min & max for v_uint8 and v_uint16 to be same as lane type. * Cast v_reduce functions to int to avoid overflow. Reduce number of parameters in MINMAXIDX_REDUCE macro. * Restore cast type for v_reduce_min & max to LaneType

Merge pull request #15488 from ChipKerchner:vectorizeMinMax2
Vectorize minMaxIdx functions * Updated documentation and intrinsic tests for v_reduce * Add other files back in from the forced push * Prevent a constant overflow with v_reduce for int8 type * Another alternative to fix constant overflow warning. * Fix another compiler warning. * Update comments and change comparison form to be consistent with other vectorized loops. * Change return type of v_reduce_min & max for v_uint8 and v_uint16 to be same as lane type. * Cast v_reduce functions to int to avoid overflow. Reduce number of parameters in MINMAXIDX_REDUCE macro. * Restore cast type for v_reduce_min & max to LaneType
301626ba · Chip Kerchner · Alexander Alekhin · 886220b9 · 301626ba · 301626ba
Commit 301626ba authored Jan 17, 2020 by Chip Kerchner Committed by Alexander Alekhin Jan 17, 2020
4 changed files
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -213,7 +213,7 @@ Regular integers:
 |min, max           | x | x | x | x | x | x |
 |absdiff            | x | x | x | x | x | x |
 |absdiffs           |   | x |   | x |   |   |
-|reduce             |   |   |   |   | x | x |
+|reduce             | x | x | x | x | x | x |
 |mask               | x | x | x | x | x | x |
 |pack               | x | x | x | x | x | x |
 |pack_u             | x |   | x |   |   |   |
@@ -670,7 +670,7 @@ Scheme:
 @code
 {A1 A2 A3 ...} => min(A1,A2,A3,...)
 @endcode
-For 32-bit integer and 32-bit floating point types. */
+For all types except 64-bit integer and 64-bit floating point types. */
 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)

 /** @brief Find one max value
@@ -679,7 +679,7 @@ Scheme:
 @code
 {A1 A2 A3 ...} => max(A1,A2,A3,...)
 @endcode
-For 32-bit integer and 32-bit floating point types. */
+For all types except 64-bit integer and 64-bit floating point types. */
 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)

 static const unsigned char popCountTable[] =
@@ -1219,7 +1219,7 @@ Scheme:
 @code
 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
 @endcode
-For 32-bit integer and 32-bit floating point types.*/
+*/
 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
 {
    typename V_TypeTraits<_Tp>::sum_type c = a.s[0];

--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1241,6 +1241,20 @@ inline int v_reduce_sum(const v_int16x8& a)
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
 }

+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
+}
+
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
+
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1249,10 +1263,10 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 }

-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min, u16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)

 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \

--- a/modules/core/src/minmax.cpp
+++ b/modules/core/src/minmax.cpp
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -894,13 +894,18 @@ template<typename R> struct TheTest
    TheTest & test_reduce()
    {
        Data<R> dataA;
+        int sum = 0;
+        for (int i = 0; i < R::nlanes; ++i)
+        {
+            sum += (int)(dataA[i]);   // To prevent a constant overflow with int8
+        }
        R a = dataA;
-        EXPECT_EQ((LaneType)1, v_reduce_min(a));
-        EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a));
-        EXPECT_EQ((LaneType)((1 + R::nlanes)*R::nlanes/2), v_reduce_sum(a));
+        EXPECT_EQ((LaneType)1, (LaneType)v_reduce_min(a));
+        EXPECT_EQ((LaneType)(R::nlanes), (LaneType)v_reduce_max(a));
+        EXPECT_EQ((int)(sum), (int)v_reduce_sum(a));
        dataA[0] += R::nlanes;
        R an = dataA;
-        EXPECT_EQ((LaneType)2, v_reduce_min(an));
+        EXPECT_EQ((LaneType)2, (LaneType)v_reduce_min(an));
        return *this;
    }

@@ -1588,6 +1593,7 @@ void test_hal_intrin_uint8()
        .test_dotprod_expand()
        .test_min_max()
        .test_absdiff()
+        .test_reduce()
        .test_reduce_sad()
        .test_mask()
        .test_popcount()
@@ -1629,6 +1635,7 @@ void test_hal_intrin_int8()
        .test_absdiff()
        .test_absdiffs()
        .test_abs()
+        .test_reduce()
        .test_reduce_sad()
        .test_mask()
        .test_popcount()