Commit 3c70d966 authored by Alexander Alekhin's avatar Alexander Alekhin

Merge pull request #13823 from alalek:core_dispatch_countNonZero

parents d9b57379 e3633ec4
......@@ -5,6 +5,7 @@ ocv_add_dispatched_file(stat SSE4_2 AVX2)
ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
ocv_add_dispatched_file(convert SSE2 AVX2)
ocv_add_dispatched_file(convert_scale SSE2 AVX2)
ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
ocv_add_dispatched_file(sum SSE2 AVX2)
# dispatching for accuracy tests
......
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "stat.hpp"
#include "count_non_zero.simd.hpp"
#include "count_non_zero.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv {
static CountNonZeroFunc getCountNonZeroTab(int depth)
{
CV_INSTRUMENT_REGION();
CV_CPU_DISPATCH(getCountNonZeroTab, (depth),
CV_CPU_DISPATCH_MODES_ALL);
}
#ifdef HAVE_OPENCL
static bool ocl_countNonZero( InputArray _src, int & res )
{
int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (depth == CV_64F && !doubleSupport)
return false;
int dbsize = ocl::Device::getDefault().maxComputeUnits();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
int wgs2_aligned = 1;
while (wgs2_aligned < (int)wgs)
wgs2_aligned <<= 1;
wgs2_aligned >>= 1;
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
" -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::typeToStr(depth), (int)wgs, kercn,
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
if (k.empty())
return false;
UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
dbsize, ocl::KernelArg::PtrWriteOnly(db));
size_t globalsize = dbsize * wgs;
if (k.run(1, &globalsize, &wgs, true))
return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
return false;
}
#endif
#if defined HAVE_IPP
static bool ipp_countNonZero( Mat &src, int &res )
{
CV_INSTRUMENT_REGION_IPP();
#if IPP_VERSION_X100 < 201801
// Poor performance of SSE42
if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
return false;
#endif
Ipp32s count = 0;
int depth = src.depth();
if(src.dims <= 2)
{
IppStatus status;
IppiSize size = {src.cols*src.channels(), src.rows};
if(depth == CV_8U)
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
else if(depth == CV_32F)
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
else
return false;
if(status < 0)
return false;
res = size.width*size.height - count;
}
else
{
IppStatus status;
const Mat *arrays[] = {&src, NULL};
Mat planes[1];
NAryMatIterator it(arrays, planes, 1);
IppiSize size = {(int)it.size*src.channels(), 1};
res = 0;
for (size_t i = 0; i < it.nplanes; i++, ++it)
{
if(depth == CV_8U)
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
else if(depth == CV_32F)
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);
else
return false;
if(status < 0 || (int)it.planes->total()*src.channels() < count)
return false;
res += (int)it.planes->total()*src.channels() - count;
}
}
return true;
}
#endif
int countNonZero(InputArray _src)
{
CV_INSTRUMENT_REGION();
int type = _src.type(), cn = CV_MAT_CN(type);
CV_Assert( cn == 1 );
#if defined HAVE_OPENCL || defined HAVE_IPP
int res = -1;
#endif
#ifdef HAVE_OPENCL
CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
ocl_countNonZero(_src, res),
res)
#endif
Mat src = _src.getMat();
CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);
CountNonZeroFunc func = getCountNonZeroTab(src.depth());
CV_Assert( func != 0 );
const Mat* arrays[] = {&src, 0};
uchar* ptrs[1] = {};
NAryMatIterator it(arrays, ptrs);
int total = (int)it.size, nz = 0;
for( size_t i = 0; i < it.nplanes; i++, ++it )
nz += func( ptrs[0], total );
return nz;
}
void findNonZero(InputArray _src, OutputArray _idx)
{
CV_INSTRUMENT_REGION();
Mat src = _src.getMat();
CV_Assert( src.type() == CV_8UC1 );
int n = countNonZero(src);
if( n == 0 )
{
_idx.release();
return;
}
if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
_idx.release();
_idx.create(n, 1, CV_32SC2);
Mat idx = _idx.getMat();
CV_Assert(idx.isContinuous());
Point* idx_ptr = idx.ptr<Point>();
for( int i = 0; i < src.rows; i++ )
{
const uchar* bin_ptr = src.ptr(i);
for( int j = 0; j < src.cols; j++ )
if( bin_ptr[j] )
*idx_ptr++ = Point(j, i);
}
}
} // namespace
......@@ -2,13 +2,20 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "stat.hpp"
namespace cv {
typedef int (*CountNonZeroFunc)(const uchar*, int);
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
CountNonZeroFunc getCountNonZeroTab(int depth);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
template<typename T>
static int countNonZero_(const T* src, int len )
{
......@@ -175,9 +182,7 @@ static int countNonZero64f( const double* src, int len )
return countNonZero_(src, len);
}
typedef int (*CountNonZeroFunc)(const uchar*, int);
static CountNonZeroFunc getCountNonZeroTab(int depth)
CountNonZeroFunc getCountNonZeroTab(int depth)
{
static CountNonZeroFunc countNonZeroTab[] =
{
......@@ -190,164 +195,7 @@ static CountNonZeroFunc getCountNonZeroTab(int depth)
return countNonZeroTab[depth];
}
#ifdef HAVE_OPENCL
static bool ocl_countNonZero( InputArray _src, int & res )
{
int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (depth == CV_64F && !doubleSupport)
return false;
int dbsize = ocl::Device::getDefault().maxComputeUnits();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
int wgs2_aligned = 1;
while (wgs2_aligned < (int)wgs)
wgs2_aligned <<= 1;
wgs2_aligned >>= 1;
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
" -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::typeToStr(depth), (int)wgs, kercn,
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
if (k.empty())
return false;
UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
dbsize, ocl::KernelArg::PtrWriteOnly(db));
size_t globalsize = dbsize * wgs;
if (k.run(1, &globalsize, &wgs, true))
return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
return false;
}
#endif
#if defined HAVE_IPP
static bool ipp_countNonZero( Mat &src, int &res )
{
CV_INSTRUMENT_REGION_IPP();
#if IPP_VERSION_X100 < 201801
// Poor performance of SSE42
if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
return false;
#endif
Ipp32s count = 0;
int depth = src.depth();
if(src.dims <= 2)
{
IppStatus status;
IppiSize size = {src.cols*src.channels(), src.rows};
if(depth == CV_8U)
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
else if(depth == CV_32F)
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
else
return false;
if(status < 0)
return false;
res = size.width*size.height - count;
}
else
{
IppStatus status;
const Mat *arrays[] = {&src, NULL};
Mat planes[1];
NAryMatIterator it(arrays, planes, 1);
IppiSize size = {(int)it.size*src.channels(), 1};
res = 0;
for (size_t i = 0; i < it.nplanes; i++, ++it)
{
if(depth == CV_8U)
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
else if(depth == CV_32F)
status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);
else
return false;
if(status < 0 || (int)it.planes->total()*src.channels() < count)
return false;
res += (int)it.planes->total()*src.channels() - count;
}
}
return true;
}
#endif
} // cv::
int cv::countNonZero( InputArray _src )
{
CV_INSTRUMENT_REGION();
int type = _src.type(), cn = CV_MAT_CN(type);
CV_Assert( cn == 1 );
#if defined HAVE_OPENCL || defined HAVE_IPP
int res = -1;
#endif
#ifdef HAVE_OPENCL
CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
ocl_countNonZero(_src, res),
res)
#endif
Mat src = _src.getMat();
CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);
CountNonZeroFunc func = getCountNonZeroTab(src.depth());
CV_Assert( func != 0 );
const Mat* arrays[] = {&src, 0};
uchar* ptrs[1] = {};
NAryMatIterator it(arrays, ptrs);
int total = (int)it.size, nz = 0;
for( size_t i = 0; i < it.nplanes; i++, ++it )
nz += func( ptrs[0], total );
return nz;
}
void cv::findNonZero( InputArray _src, OutputArray _idx )
{
CV_INSTRUMENT_REGION();
Mat src = _src.getMat();
CV_Assert( src.type() == CV_8UC1 );
int n = countNonZero(src);
if( n == 0 )
{
_idx.release();
return;
}
if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
_idx.release();
_idx.create(n, 1, CV_32SC2);
Mat idx = _idx.getMat();
CV_Assert(idx.isContinuous());
Point* idx_ptr = idx.ptr<Point>();
for( int i = 0; i < src.rows; i++ )
{
const uchar* bin_ptr = src.ptr(i);
for( int j = 0; j < src.cols; j++ )
if( bin_ptr[j] )
*idx_ptr++ = Point(j, i);
}
}
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // namespace
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment