Merge pull request #13823 from alalek:core_dispatch_countNonZero

3c70d966 · Alexander Alekhin · d9b57379 · e3633ec4 · 3c70d966 · 3c70d966
Commit 3c70d966 authored Feb 14, 2019 by Alexander Alekhin
Showing with 195 additions and 165 deletions

CMakeLists.txt modules/core/CMakeLists.txt +1 -0

count_non_zero.dispatch.cpp modules/core/src/count_non_zero.dispatch.cpp +181 -0

count_non_zero.simd.hpp modules/core/src/count_non_zero.simd.hpp +13 -165

No files found.
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -5,6 +5,7 @@ ocv_add_dispatched_file(stat SSE4_2 AVX2)
 ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
 ocv_add_dispatched_file(convert SSE2 AVX2)
 ocv_add_dispatched_file(convert_scale SSE2 AVX2)
+ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
 ocv_add_dispatched_file(sum SSE2 AVX2)
 # dispatching for accuracy tests

--- a/modules/core/src/count_non_zero.dispatch.cpp
+++ b/modules/core/src/count_non_zero.dispatch.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+#include "precomp.hpp"
+#include "opencl_kernels_core.hpp"
+#include "stat.hpp"
+#include "count_non_zero.simd.hpp"
+#include "count_non_zero.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
+namespace cv {
+static CountNonZeroFunc getCountNonZeroTab(int depth)
+{
+    CV_INSTRUMENT_REGION();
+    CV_CPU_DISPATCH(getCountNonZeroTab, (depth),
+        CV_CPU_DISPATCH_MODES_ALL);
+}
+#ifdef HAVE_OPENCL
+static bool ocl_countNonZero( InputArray _src, int & res )
+{
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    if (depth == CV_64F && !doubleSupport)
+        return false;
+    int dbsize = ocl::Device::getDefault().maxComputeUnits();
+    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+    int wgs2_aligned = 1;
+    while (wgs2_aligned < (int)wgs)
+        wgs2_aligned <<= 1;
+    wgs2_aligned >>= 1;
+    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
+                  format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
+                         " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
+                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
+                         ocl::typeToStr(depth), (int)wgs, kercn,
+                         wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
+    if (k.empty())
+        return false;
+    UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
+    k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
+           dbsize, ocl::KernelArg::PtrWriteOnly(db));
+    size_t globalsize = dbsize * wgs;
+    if (k.run(1, &globalsize, &wgs, true))
+        return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
+    return false;
+}
+#endif
+#if defined HAVE_IPP
+static bool ipp_countNonZero( Mat &src, int &res )
+{
+    CV_INSTRUMENT_REGION_IPP();
+#if IPP_VERSION_X100 < 201801
+    // Poor performance of SSE42
+    if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
+        return false;
+#endif
+    Ipp32s  count = 0;
+    int     depth = src.depth();
+    if(src.dims <= 2)
+    {
+        IppStatus status;
+        IppiSize  size = {src.cols*src.channels(), src.rows};
+        if(depth == CV_8U)
+            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
+        else if(depth == CV_32F)
+            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
+        else
+            return false;
+        if(status < 0)
+            return false;
+        res = size.width*size.height - count;
+    }
+    else
+    {
+        IppStatus       status;
+        const Mat      *arrays[] = {&src, NULL};
+        Mat            planes[1];
+        NAryMatIterator it(arrays, planes, 1);
+        IppiSize        size  = {(int)it.size*src.channels(), 1};
+        res = 0;
+        for (size_t i = 0; i < it.nplanes; i++, ++it)
+        {
+            if(depth == CV_8U)
+                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
+            else if(depth == CV_32F)
+                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);
+            else
+                return false;
+            if(status < 0 || (int)it.planes->total()*src.channels() < count)
+                return false;
+            res += (int)it.planes->total()*src.channels() - count;
+        }
+    }
+    return true;
+}
+#endif
+int countNonZero(InputArray _src)
+{
+    CV_INSTRUMENT_REGION();
+    int type = _src.type(), cn = CV_MAT_CN(type);
+    CV_Assert( cn == 1 );
+#if defined HAVE_OPENCL || defined HAVE_IPP
+    int res = -1;
+#endif
+#ifdef HAVE_OPENCL
+    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
+                ocl_countNonZero(_src, res),
+                res)
+#endif
+    Mat src = _src.getMat();
+    CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);
+    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
+    CV_Assert( func != 0 );
+    const Mat* arrays[] = {&src, 0};
+    uchar* ptrs[1] = {};
+    NAryMatIterator it(arrays, ptrs);
+    int total = (int)it.size, nz = 0;
+    for( size_t i = 0; i < it.nplanes; i++, ++it )
+        nz += func( ptrs[0], total );
+    return nz;
+}
+void findNonZero(InputArray _src, OutputArray _idx)
+{
+    CV_INSTRUMENT_REGION();
+    Mat src = _src.getMat();
+    CV_Assert( src.type() == CV_8UC1 );
+    int n = countNonZero(src);
+    if( n == 0 )
+    {
+        _idx.release();
+        return;
+    }
+    if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
+        _idx.release();
+    _idx.create(n, 1, CV_32SC2);
+    Mat idx = _idx.getMat();
+    CV_Assert(idx.isContinuous());
+    Point* idx_ptr = idx.ptr<Point>();
+    for( int i = 0; i < src.rows; i++ )
+    {
+        const uchar* bin_ptr = src.ptr(i);
+        for( int j = 0; j < src.cols; j++ )
+            if( bin_ptr[j] )
+                *idx_ptr++ = Point(j, i);
+    }
+}
+} // namespace
--- a/modules/core/src/count_non_zero.cpp
+++ b/modules/core/src/count_non_zero.cpp
@@ -2,13 +2,20 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
 #include "precomp.hpp"
-#include "opencl_kernels_core.hpp"
-#include "stat.hpp"
 namespace cv {
+typedef int (*CountNonZeroFunc)(const uchar*, int);
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+CountNonZeroFunc getCountNonZeroTab(int depth);
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 template<typename T>
 static int countNonZero_(const T* src, int len )
 {
@@ -175,9 +182,7 @@ static int countNonZero64f( const double* src, int len )
    return countNonZero_(src, len);
 }
-typedef int (*CountNonZeroFunc)(const uchar*, int);
+CountNonZeroFunc getCountNonZeroTab(int depth)
-static CountNonZeroFunc getCountNonZeroTab(int depth)
 {
    static CountNonZeroFunc countNonZeroTab[] =
    {
@@ -190,164 +195,7 @@ static CountNonZeroFunc getCountNonZeroTab(int depth)
    return countNonZeroTab[depth];
 }
-#ifdef HAVE_OPENCL
-static bool ocl_countNonZero( InputArray _src, int & res )
-{
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-    if (depth == CV_64F && !doubleSupport)
-        return false;
-    int dbsize = ocl::Device::getDefault().maxComputeUnits();
-    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
-    int wgs2_aligned = 1;
-    while (wgs2_aligned < (int)wgs)
-        wgs2_aligned <<= 1;
-    wgs2_aligned >>= 1;
-    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                  format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
-                         " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
-                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
-                         ocl::typeToStr(depth), (int)wgs, kercn,
-                         wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
-                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
-    if (k.empty())
-        return false;
-    UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-           dbsize, ocl::KernelArg::PtrWriteOnly(db));
-    size_t globalsize = dbsize * wgs;
-    if (k.run(1, &globalsize, &wgs, true))
-        return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
-    return false;
-}
-#endif
-#if defined HAVE_IPP
-static bool ipp_countNonZero( Mat &src, int &res )
-{
-    CV_INSTRUMENT_REGION_IPP();
-#if IPP_VERSION_X100 < 201801
-    // Poor performance of SSE42
-    if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
-        return false;
-#endif
-    Ipp32s  count = 0;
-    int     depth = src.depth();
-    if(src.dims <= 2)
-    {
-        IppStatus status;
-        IppiSize  size = {src.cols*src.channels(), src.rows};
-        if(depth == CV_8U)
-            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
-        else if(depth == CV_32F)
-            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
-        else
-            return false;
-        if(status < 0)
-            return false;
-        res = size.width*size.height - count;
-    }
-    else
-    {
-        IppStatus       status;
-        const Mat      *arrays[] = {&src, NULL};
-        Mat            planes[1];
-        NAryMatIterator it(arrays, planes, 1);
-        IppiSize        size  = {(int)it.size*src.channels(), 1};
-        res = 0;
-        for (size_t i = 0; i < it.nplanes; i++, ++it)
-        {
-            if(depth == CV_8U)
-                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
-            else if(depth == CV_32F)
-                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);
-            else
-                return false;
-            if(status < 0 || (int)it.planes->total()*src.channels() < count)
-                return false;
-            res += (int)it.planes->total()*src.channels() - count;
-        }
-    }
-    return true;
-}
-#endif
-} // cv::
-int cv::countNonZero( InputArray _src )
-{
-    CV_INSTRUMENT_REGION();
-    int type = _src.type(), cn = CV_MAT_CN(type);
-    CV_Assert( cn == 1 );
-#if defined HAVE_OPENCL || defined HAVE_IPP
-    int res = -1;
-#endif
-#ifdef HAVE_OPENCL
-    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
-                ocl_countNonZero(_src, res),
-                res)
 #endif
-    Mat src = _src.getMat();
+CV_CPU_OPTIMIZATION_NAMESPACE_END
-    CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);
+} // namespace
-    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
-    CV_Assert( func != 0 );
-    const Mat* arrays[] = {&src, 0};
-    uchar* ptrs[1] = {};
-    NAryMatIterator it(arrays, ptrs);
-    int total = (int)it.size, nz = 0;
-    for( size_t i = 0; i < it.nplanes; i++, ++it )
-        nz += func( ptrs[0], total );
-    return nz;
-}
-void cv::findNonZero( InputArray _src, OutputArray _idx )
-{
-    CV_INSTRUMENT_REGION();
-    Mat src = _src.getMat();
-    CV_Assert( src.type() == CV_8UC1 );
-    int n = countNonZero(src);
-    if( n == 0 )
-    {
-        _idx.release();
-        return;
-    }
-    if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
-        _idx.release();
-    _idx.create(n, 1, CV_32SC2);
-    Mat idx = _idx.getMat();
-    CV_Assert(idx.isContinuous());
-    Point* idx_ptr = idx.ptr<Point>();
-    for( int i = 0; i < src.rows; i++ )
-    {
-        const uchar* bin_ptr = src.ptr(i);
-        for( int j = 0; j < src.cols; j++ )
-            if( bin_ptr[j] )
-                *idx_ptr++ = Point(j, i);
-    }
-}