Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
e3633ec4
Commit
e3633ec4
authored
Feb 13, 2019
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
core: dispatch count_non_zero
parent
0b496803
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
24 additions
and
347 deletions
+24
-347
CMakeLists.txt
modules/core/CMakeLists.txt
+1
-0
count_non_zero.dispatch.cpp
modules/core/src/count_non_zero.dispatch.cpp
+10
-182
count_non_zero.simd.hpp
modules/core/src/count_non_zero.simd.hpp
+13
-165
No files found.
modules/core/CMakeLists.txt
View file @
e3633ec4
...
...
@@ -5,6 +5,7 @@ ocv_add_dispatched_file(stat SSE4_2 AVX2)
ocv_add_dispatched_file
(
arithm SSE2 SSE4_1 AVX2 VSX3
)
ocv_add_dispatched_file
(
convert SSE2 AVX2
)
ocv_add_dispatched_file
(
convert_scale SSE2 AVX2
)
ocv_add_dispatched_file
(
count_non_zero SSE2 AVX2
)
ocv_add_dispatched_file
(
sum SSE2 AVX2
)
# dispatching for accuracy tests
...
...
modules/core/src/count_non_zero.dispatch.cpp
View file @
e3633ec4
...
...
@@ -7,190 +7,18 @@
#include "opencl_kernels_core.hpp"
#include "stat.hpp"
namespace
cv
{
// Generic scalar counter: returns how many of the `len` elements starting at
// `src` compare unequal to zero. Works for any element type T that supports
// `!= 0`. A non-positive `len` yields 0.
template <typename T>
static int countNonZero_(const T* src, int len)
{
    int count = 0;
    int pos = 0;
#if CV_ENABLE_UNROLLED
    // Optional manually unrolled pass: four elements per iteration.
    while (pos <= len - 4)
    {
        const T* quad = src + pos;
        count += (quad[0] != 0) + (quad[1] != 0) + (quad[2] != 0) + (quad[3] != 0);
        pos += 4;
    }
#endif
    // Remaining elements (or all of them when unrolling is disabled).
    while (pos < len)
    {
        if (src[pos] != 0)
            ++count;
        ++pos;
    }
    return count;
}
// Counts the non-zero bytes among the first `len` elements of `src`.
//
// When CV_SIMD is enabled, the vector-sized prefix is processed by counting
// the *zero* lanes into progressively wider accumulators (8 -> 16 -> 32 bit)
// and subtracting that total from the number of elements consumed. The tail
// (or the whole input without SIMD) is handled by a scalar loop.
static int countNonZero8u( const uchar* src, int len )
{
    int pos = 0;
    int count = 0;
#if CV_SIMD
    // Round len down to a multiple of the vector width (mask trick; assumes
    // v_uint8::nlanes is a power of two).
    const int simdEnd = len & -v_uint8::nlanes;
    v_uint8 zeroVec = vx_setzero_u8();
    v_uint8 oneVec = vx_setall_u8(1);
    v_uint32 zeros32 = vx_setzero_u32();

    while (pos < simdEnd)
    {
        // Chunk limit for the 16-bit accumulator (keeps each 16-bit lane at
        // most 65280, assuming v_uint8 has twice the lanes of v_uint16).
        const int end16 = std::min(simdEnd, pos + 65280 * v_uint16::nlanes);
        v_uint16 zeros16 = vx_setzero_u16();
        int mid = pos;
        while (mid < end16)
        {
            // At most 255 vector steps, so an 8-bit lane cannot overflow.
            const int end8 = std::min(simdEnd, mid + 255 * v_uint8::nlanes);
            v_uint8 zeros8 = vx_setzero_u8();
            int cur = mid;
            while (cur < end8)
            {
                // Comparison mask ANDed with 1 adds one per zero-valued lane.
                zeros8 += oneVec & (vx_load(src + cur) == zeroVec);
                cur += v_uint8::nlanes;
            }
            v_uint16 lo16, hi16;
            v_expand(zeros8, lo16, hi16);
            zeros16 += lo16 + hi16;
            mid = cur;
        }
        v_uint32 lo32, hi32;
        v_expand(zeros16, lo32, hi32);
        zeros32 += lo32 + hi32;
        pos = mid;
    }
    // Elements consumed minus zeros found = non-zero elements.
    count = pos - v_reduce_sum(zeros32);
    v_cleanup();
#endif
    // Scalar tail.
    while (pos < len)
    {
        count += src[pos] != 0;
        ++pos;
    }
    return count;
}
// Counts the non-zero 16-bit elements among the first `len` elements of `src`.
//
// With CV_SIMD, two 16-bit vectors are compared against zero per step, the
// masks are packed to signed 8-bit lanes, and zero lanes are tallied in
// signed 8 -> 16 -> 32 bit accumulators. The zero total is subtracted from
// the number of elements consumed; the remainder goes to countNonZero_.
static int countNonZero16u( const ushort* src, int len )
{
    int pos = 0;
    int count = 0;
#if CV_SIMD
    // Round len down to a multiple of v_int8::nlanes (elements per step;
    // mask trick assumes a power-of-two lane count).
    const int simdEnd = len & -v_int8::nlanes;
    v_uint16 zeroVec = vx_setzero_u16();
    v_int8 oneVec = vx_setall_s8(1);
    v_int32 zeros32 = vx_setzero_s32();

    while (pos < simdEnd)
    {
        // Chunk limit chosen so the signed 16-bit accumulator stays in range.
        const int end16 = std::min(simdEnd, pos + 32766 * v_int16::nlanes);
        v_int16 zeros16 = vx_setzero_s16();
        int mid = pos;
        while (mid < end16)
        {
            // At most 127 steps, so a signed 8-bit lane cannot overflow.
            const int end8 = std::min(simdEnd, mid + 127 * v_int8::nlanes);
            v_int8 zeros8 = vx_setzero_s8();
            int cur = mid;
            while (cur < end8)
            {
                v_int16 maskLo = v_reinterpret_as_s16(vx_load(src + cur) == zeroVec);
                v_int16 maskHi = v_reinterpret_as_s16(vx_load(src + cur + v_uint16::nlanes) == zeroVec);
                zeros8 += oneVec & v_pack(maskLo, maskHi);
                cur += v_int8::nlanes;
            }
            v_int16 lo16, hi16;
            v_expand(zeros8, lo16, hi16);
            zeros16 += lo16 + hi16;
            mid = cur;
        }
        v_int32 lo32, hi32;
        v_expand(zeros16, lo32, hi32);
        zeros32 += lo32 + hi32;
        pos = mid;
    }
    // Elements consumed minus zeros found = non-zero elements.
    count = pos - v_reduce_sum(zeros32);
    v_cleanup();
#endif
    // Scalar path for whatever the vector loop did not consume.
    return count + countNonZero_(src + pos, len - pos);
}
// Counts the non-zero 32-bit integers among the first `len` elements of `src`.
//
// With CV_SIMD, four 32-bit vectors are compared against zero per step and
// the masks are packed twice (32 -> 16 -> 8 bit). Zero lanes are tallied in
// signed 8 -> 16 -> 32 bit accumulators and subtracted from the number of
// elements consumed; the remainder goes to countNonZero_.
static int countNonZero32s( const int* src, int len )
{
    int pos = 0;
    int count = 0;
#if CV_SIMD
    // Round len down to a multiple of v_int8::nlanes (elements per step;
    // mask trick assumes a power-of-two lane count).
    const int simdEnd = len & -v_int8::nlanes;
    v_int32 zeroVec = vx_setzero_s32();
    v_int8 oneVec = vx_setall_s8(1);
    v_int32 zeros32 = vx_setzero_s32();

    while (pos < simdEnd)
    {
        // Chunk limit chosen so the signed 16-bit accumulator stays in range.
        const int end16 = std::min(simdEnd, pos + 32766 * v_int16::nlanes);
        v_int16 zeros16 = vx_setzero_s16();
        int mid = pos;
        while (mid < end16)
        {
            // At most 127 steps, so a signed 8-bit lane cannot overflow.
            const int end8 = std::min(simdEnd, mid + 127 * v_int8::nlanes);
            v_int8 zeros8 = vx_setzero_s8();
            int cur = mid;
            while (cur < end8)
            {
                const int* p = src + cur;
                v_int16 maskLo = v_pack(vx_load(p) == zeroVec,
                                        vx_load(p + v_int32::nlanes) == zeroVec);
                v_int16 maskHi = v_pack(vx_load(p + 2 * v_int32::nlanes) == zeroVec,
                                        vx_load(p + 3 * v_int32::nlanes) == zeroVec);
                zeros8 += oneVec & v_pack(maskLo, maskHi);
                cur += v_int8::nlanes;
            }
            v_int16 lo16, hi16;
            v_expand(zeros8, lo16, hi16);
            zeros16 += lo16 + hi16;
            mid = cur;
        }
        v_int32 lo32, hi32;
        v_expand(zeros16, lo32, hi32);
        zeros32 += lo32 + hi32;
        pos = mid;
    }
    // Elements consumed minus zeros found = non-zero elements.
    count = pos - v_reduce_sum(zeros32);
    v_cleanup();
#endif
    // Scalar path for whatever the vector loop did not consume.
    return count + countNonZero_(src + pos, len - pos);
}
// Counts the non-zero floats among the first `len` elements of `src`.
//
// With CV_SIMD, four float vectors are compared against zero per step, the
// masks are reinterpreted as 32-bit integers and packed twice (32 -> 16 -> 8
// bit). Zero lanes are tallied in signed 8 -> 16 -> 32 bit accumulators and
// subtracted from the number of elements consumed; the remainder goes to
// countNonZero_.
static int countNonZero32f( const float* src, int len )
{
    int pos = 0;
    int count = 0;
#if CV_SIMD
    // Round len down to a multiple of v_int8::nlanes (elements per step;
    // mask trick assumes a power-of-two lane count).
    const int simdEnd = len & -v_int8::nlanes;
    v_float32 zeroVec = vx_setzero_f32();
    v_int8 oneVec = vx_setall_s8(1);
    v_int32 zeros32 = vx_setzero_s32();

    while (pos < simdEnd)
    {
        // Chunk limit chosen so the signed 16-bit accumulator stays in range.
        const int end16 = std::min(simdEnd, pos + 32766 * v_int16::nlanes);
        v_int16 zeros16 = vx_setzero_s16();
        int mid = pos;
        while (mid < end16)
        {
            // At most 127 steps, so a signed 8-bit lane cannot overflow.
            const int end8 = std::min(simdEnd, mid + 127 * v_int8::nlanes);
            v_int8 zeros8 = vx_setzero_s8();
            int cur = mid;
            while (cur < end8)
            {
                const float* p = src + cur;
                v_int32 m0 = v_reinterpret_as_s32(vx_load(p) == zeroVec);
                v_int32 m1 = v_reinterpret_as_s32(vx_load(p + v_float32::nlanes) == zeroVec);
                v_int32 m2 = v_reinterpret_as_s32(vx_load(p + 2 * v_float32::nlanes) == zeroVec);
                v_int32 m3 = v_reinterpret_as_s32(vx_load(p + 3 * v_float32::nlanes) == zeroVec);
                zeros8 += oneVec & v_pack(v_pack(m0, m1), v_pack(m2, m3));
                cur += v_int8::nlanes;
            }
            v_int16 lo16, hi16;
            v_expand(zeros8, lo16, hi16);
            zeros16 += lo16 + hi16;
            mid = cur;
        }
        v_int32 lo32, hi32;
        v_expand(zeros16, lo32, hi32);
        zeros32 += lo32 + hi32;
        pos = mid;
    }
    // Elements consumed minus zeros found = non-zero elements.
    count = pos - v_reduce_sum(zeros32);
    v_cleanup();
#endif
    // Scalar path for whatever the vector loop did not consume.
    return count + countNonZero_(src + pos, len - pos);
}
static
int
countNonZero64f
(
const
double
*
src
,
int
len
)
{
return
countNonZero_
(
src
,
len
);
}
#include "count_non_zero.simd.hpp"
#include "count_non_zero.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
typedef
int
(
*
CountNonZeroFunc
)(
const
uchar
*
,
int
);
namespace
cv
{
// Returns the counting routine for the given matrix depth by indexing a
// static table with `depth`.
//
// Table layout as written below: indices 0-1 -> countNonZero8u,
// 2-3 -> countNonZero16u, 4 -> countNonZero32s, 5 -> countNonZero32f,
// 6 -> countNonZero64f, 7 -> null (0). `depth` is not range-checked here.
//
// NOTE(review): as this text stands, the CV_INSTRUMENT_REGION /
// CV_CPU_DISPATCH statements follow the `return` and are unreachable. This
// looks like the removed (table) body and the added (dispatch) body of a
// diff rendered together - confirm which one is the intended body.
static
CountNonZeroFunc
getCountNonZeroTab
(
int
depth
)
{
// Each entry is cast to the common CountNonZeroFunc signature
// (const uchar*, int).
static
CountNonZeroFunc
countNonZeroTab
[]
=
{
(
CountNonZeroFunc
)
GET_OPTIMIZED
(
countNonZero8u
),
(
CountNonZeroFunc
)
GET_OPTIMIZED
(
countNonZero8u
),
(
CountNonZeroFunc
)
GET_OPTIMIZED
(
countNonZero16u
),
(
CountNonZeroFunc
)
GET_OPTIMIZED
(
countNonZero16u
),
(
CountNonZeroFunc
)
GET_OPTIMIZED
(
countNonZero32s
),
(
CountNonZeroFunc
)
GET_OPTIMIZED
(
countNonZero32f
),
(
CountNonZeroFunc
)
GET_OPTIMIZED
(
countNonZero64f
),
0
};
return
countNonZeroTab
[
depth
];
// Unreachable as written (see NOTE(review) above).
CV_INSTRUMENT_REGION
();
CV_CPU_DISPATCH
(
getCountNonZeroTab
,
(
depth
),
CV_CPU_DISPATCH_MODES_ALL
);
}
#ifdef HAVE_OPENCL
static
bool
ocl_countNonZero
(
InputArray
_src
,
int
&
res
)
{
...
...
@@ -288,9 +116,7 @@ static bool ipp_countNonZero( Mat &src, int &res )
}
#endif
}
// cv::
int
cv
::
countNonZero
(
InputArray
_src
)
int
countNonZero
(
InputArray
_src
)
{
CV_INSTRUMENT_REGION
();
...
...
@@ -324,7 +150,7 @@ int cv::countNonZero( InputArray _src )
return
nz
;
}
void
cv
::
findNonZero
(
InputArray
_src
,
OutputArray
_idx
)
void
findNonZero
(
InputArray
_src
,
OutputArray
_idx
)
{
CV_INSTRUMENT_REGION
();
...
...
@@ -351,3 +177,5 @@ void cv::findNonZero( InputArray _src, OutputArray _idx )
*
idx_ptr
++
=
Point
(
j
,
i
);
}
}
}
// namespace
modules/core/src/count_non_zero.simd.hpp
View file @
e3633ec4
...
...
@@ -2,13 +2,20 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "stat.hpp"
namespace
cv
{
typedef
int
(
*
CountNonZeroFunc
)(
const
uchar
*
,
int
);
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
CountNonZeroFunc
getCountNonZeroTab
(
int
depth
);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
template
<
typename
T
>
static
int
countNonZero_
(
const
T
*
src
,
int
len
)
{
...
...
@@ -175,9 +182,7 @@ static int countNonZero64f( const double* src, int len )
return
countNonZero_
(
src
,
len
);
}
typedef
int
(
*
CountNonZeroFunc
)(
const
uchar
*
,
int
);
static
CountNonZeroFunc
getCountNonZeroTab
(
int
depth
)
CountNonZeroFunc
getCountNonZeroTab
(
int
depth
)
{
static
CountNonZeroFunc
countNonZeroTab
[]
=
{
...
...
@@ -190,164 +195,7 @@ static CountNonZeroFunc getCountNonZeroTab(int depth)
return
countNonZeroTab
[
depth
];
}
#ifdef HAVE_OPENCL
// OpenCL path for counting non-zero elements.
//
// Builds the "reduce" kernel with OP_COUNT_NON_ZERO, runs it so that it
// writes into a 1 x dbsize CV_32SC1 buffer, and stores the saturated sum of
// that buffer in `res`.
//
// Returns true when the kernel ran and `res` was written. Returns false
// (leaving `res` untouched) when the input is CV_64F without device double
// support, the kernel could not be created, or the run failed.
static bool ocl_countNonZero( InputArray _src, int & res )
{
    const int type = _src.type();
    const int depth = CV_MAT_DEPTH(type);
    const int kercn = ocl::predictOptimalVectorWidth(_src);
    const bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if (!doubleSupport && depth == CV_64F)
        return false;

    int dbsize = ocl::Device::getDefault().maxComputeUnits();
    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();

    // Double until reaching or passing the work-group size, then step back
    // one power of two.
    int wgs2_aligned = 1;
    for (; wgs2_aligned < (int)wgs; wgs2_aligned <<= 1)
    {
    }
    wgs2_aligned >>= 1;

    const char* const doubleOpt = doubleSupport ? " -D DOUBLE_SUPPORT" : "";
    const char* const contOpt = _src.isContinuous() ? " -D HAVE_SRC_CONT" : "";

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
                  format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
                         " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                         ocl::typeToStr(depth),
                         (int)wgs,
                         kercn,
                         wgs2_aligned,
                         doubleOpt,
                         contOpt));
    if (k.empty())
        return false;

    UMat src = _src.getUMat();
    UMat db(1, dbsize, CV_32SC1);
    k.args(ocl::KernelArg::ReadOnlyNoSize(src),
           src.cols,
           (int)src.total(),
           dbsize,
           ocl::KernelArg::PtrWriteOnly(db));

    size_t globalsize = dbsize * wgs;
    if (!k.run(1, &globalsize, &wgs, true))
        return false;

    // Combine the partial results written by the kernel.
    res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]);
    return true;
}
#endif
#if defined HAVE_IPP
// IPP path for counting non-zero elements; handles CV_8U and CV_32F only.
//
// ippiCountInRange_* is called with both bounds set to 0, and the count it
// reports is subtracted from the number of elements to obtain `res`.
// Matrices with dims <= 2 are processed in one call; higher-dimensional
// ones are walked plane by plane.
//
// Returns false for other depths or on an IPP failure. In the plane-by-plane
// branch `res` is reset to 0 before iterating, so it may already have been
// modified when false is returned.
static bool ipp_countNonZero( Mat &src, int &res )
{
    CV_INSTRUMENT_REGION_IPP();

#if IPP_VERSION_X100 < 201801
    // Poor performance of SSE42
    if (cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
        return false;
#endif

    Ipp32s count = 0;
    const int depth = src.depth();

    if (src.dims <= 2)
    {
        IppStatus status;
        // Channels are folded into the row width.
        IppiSize size = {src.cols*src.channels(), src.rows};

        switch (depth)
        {
        case CV_8U:
            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
            break;
        case CV_32F:
            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
            break;
        default:
            return false;
        }

        if (status < 0)
            return false;

        res = size.width*size.height - count;
    }
    else
    {
        IppStatus status;
        const Mat* inputs[] = {&src, NULL};
        Mat planeBuf[1];
        NAryMatIterator planeIt(inputs, planeBuf, 1);
        // Each plane is handed to IPP as a single row.
        IppiSize size = {(int)planeIt.size*src.channels(), 1};
        res = 0;
        for (size_t p = 0; p < planeIt.nplanes; p++, ++planeIt)
        {
            switch (depth)
            {
            case CV_8U:
                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, planeIt.planes->ptr<Ipp8u>(), (int)planeIt.planes->step, size, &count, 0, 0);
                break;
            case CV_32F:
                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, planeIt.planes->ptr<Ipp32f>(), (int)planeIt.planes->step, size, &count, 0, 0);
                break;
            default:
                return false;
            }

            const int planeElems = (int)planeIt.planes->total()*src.channels();
            // Reject failures and counts larger than the plane itself.
            if (status < 0 || planeElems < count)
                return false;

            res += planeElems - count;
        }
    }

    return true;
}
#endif
}
// cv::
// Returns the number of non-zero elements of a single-channel array.
//
// Asserts that the input has exactly one channel. The OpenCL and IPP macros
// are given a chance first (presumably returning `res` when the accelerated
// path succeeds - see their definitions). Otherwise the routine chosen by
// getCountNonZeroTab for the matrix depth is applied to every plane and the
// results are summed.
int cv::countNonZero( InputArray _src )
{
    CV_INSTRUMENT_REGION();

    const int srcType = _src.type();
    const int cn = CV_MAT_CN(srcType);
    CV_Assert( cn == 1 );

#if defined HAVE_OPENCL || defined HAVE_IPP
    int res = -1;
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_countNonZero(_src, res),
                res)
#endif

    Mat src = _src.getMat();
    CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);

    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
    CV_Assert( func != 0 );

    const Mat* inputs[] = {&src, 0};
    uchar* planePtr[1] = {};
    NAryMatIterator planeIt(inputs, planePtr);
    const int planeLen = (int)planeIt.size;

    // Accumulate the per-plane counts.
    int count = 0;
    size_t planeIdx = 0;
    while (planeIdx < planeIt.nplanes)
    {
        count += func(planePtr[0], planeLen);
        planeIdx++;
        ++planeIt;
    }

    return count;
}
// Writes the (x, y) locations of all non-zero pixels of a CV_8UC1 image
// into `_idx` as an n x 1 CV_32SC2 array, in row-major scan order.
//
// If the image has no non-zero pixels, `_idx` is released and nothing else
// is done. A non-continuous Mat destination is released before being
// re-created.
void cv::findNonZero( InputArray _src, OutputArray _idx )
{
    CV_INSTRUMENT_REGION();

    Mat src = _src.getMat();
    CV_Assert( src.type() == CV_8UC1 );

    const int nonZeroCount = countNonZero(src);
    if (nonZeroCount == 0)
    {
        _idx.release();
        return;
    }

    const bool nonContinuousMat =
        _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous();
    if (nonContinuousMat)
        _idx.release();

    _idx.create(nonZeroCount, 1, CV_32SC2);
    Mat idx = _idx.getMat();
    CV_Assert(idx.isContinuous());

    // Output cursor; advanced once per non-zero pixel.
    Point* out = idx.ptr<Point>();
    for (int row = 0; row < src.rows; row++)
    {
        const uchar* rowData = src.ptr(row);
        for (int col = 0; col < src.cols; col++)
        {
            if (rowData[col])
            {
                *out = Point(col, row);
                ++out;
            }
        }
    }
}
CV_CPU_OPTIMIZATION_NAMESPACE_END
}
// namespace
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment