Commit 3064e40d authored Feb 20, 2019 by Alexander Alekhin

Merge pull request #13866 from alalek:core_dispatch_mean
parents 334c4d62 dc84cf99
Showing 3 changed files with 340 additions and 315 deletions:

    modules/core/CMakeLists.txt           +1   -0
    modules/core/src/mean.dispatch.cpp   +14   -315
    modules/core/src/mean.simd.hpp      +325   -0
modules/core/CMakeLists.txt

@@ -7,6 +7,7 @@ ocv_add_dispatched_file(convert SSE2 AVX2)
 ocv_add_dispatched_file(convert_scale SSE2 AVX2)
 ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
 ocv_add_dispatched_file(matmul SSE2 AVX2)
+ocv_add_dispatched_file(mean SSE2 AVX2)
 ocv_add_dispatched_file(sum SSE2 AVX2)
 # dispatching for accuracy tests
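The new ocv_add_dispatched_file(mean SSE2 AVX2) entry asks the build to compile mean.simd.hpp once for the baseline ISA and once per listed optimization, and to generate the mean.simd_declarations.hpp header that the dispatch file includes below. The snippet is only a rough sketch of what that generated header looks like; the macro and helper-header names are assumptions, and only the CV_CPU_DISPATCH_MODES_ALL define is confirmed by the comment in mean.dispatch.cpp.

// Rough sketch (assumption, not the generated file itself) of the
// mean.simd_declarations.hpp produced for "ocv_add_dispatched_file(mean SSE2 AVX2)":
#define CV_CPU_SIMD_FILENAME "mean.simd.hpp"
#define CV_CPU_DISPATCH_MODE SSE2
#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp" // declares opt_SSE2::getSumSqrFunc()
#define CV_CPU_DISPATCH_MODE AVX2
#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp" // declares opt_AVX2::getSumSqrFunc()
#define CV_CPU_DISPATCH_MODES_ALL AVX2, SSE2, BASELINE                // consumed by CV_CPU_DISPATCH below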
modules/core/src/mean.cpp → modules/core/src/mean.dispatch.cpp

@@ -8,9 +8,12 @@
 #include "opencv2/core/openvx/ovx_defs.hpp"
 #include "stat.hpp"
 
+#include "mean.simd.hpp"
+#include "mean.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
+
+namespace cv {
+
 #if defined HAVE_IPP
-namespace cv
-{
 static bool ipp_mean( Mat &src, Mat &mask, Scalar &ret )
 {
     CV_INSTRUMENT_REGION_IPP();
@@ -101,10 +104,9 @@ static bool ipp_mean( Mat &src, Mat &mask, Scalar &ret )
     return false;
 #endif
 }
-}
 #endif
 
-cv::Scalar cv::mean( InputArray _src, InputArray _mask )
+Scalar mean( InputArray _src, InputArray _mask )
 {
     CV_INSTRUMENT_REGION();
@@ -167,314 +169,11 @@ cv::Scalar cv::mean( InputArray _src, InputArray _mask )
     return s*(nz0 ? 1./nz0 : 0);
 }
 
 //==================================================================================================
 
-namespace cv
-{
-
-template <typename T, typename ST, typename SQT>
-struct SumSqr_SIMD
+static SumSqrFunc getSumSqrFunc(int depth)
 {
-    int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
-    {
-        return 0;
-    }
-};
-
-#if CV_SIMD
-
-template <>
-struct SumSqr_SIMD<uchar, int, int>
-{
-    int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
-    {
-        if (mask || (cn != 1 && cn != 2 && cn != 4))
-            return 0;
-        len *= cn;
-
-        int x = 0;
-        v_int32 v_sum = vx_setzero_s32();
-        v_int32 v_sqsum = vx_setzero_s32();
-
-        const int len0 = len & -v_uint8::nlanes;
-        while(x < len0)
-        {
-            const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
-            v_uint16 v_sum16 = vx_setzero_u16();
-            for ( ; x < len_tmp; x += v_uint8::nlanes)
-            {
-                v_uint16 v_src0 = vx_load_expand(src0 + x);
-                v_uint16 v_src1 = vx_load_expand(src0 + x + v_uint16::nlanes);
-                v_sum16 += v_src0 + v_src1;
-
-                v_int16 v_tmp0, v_tmp1;
-                v_zip(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_src1), v_tmp0, v_tmp1);
-                v_sqsum += v_dotprod(v_tmp0, v_tmp0) + v_dotprod(v_tmp1, v_tmp1);
-            }
-            v_uint32 v_half0, v_half1;
-            v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_reinterpret_as_s32(v_half0 + v_half1);
-        }
-        if (x <= len - v_uint16::nlanes)
-        {
-            v_uint16 v_src = vx_load_expand(src0 + x);
-            v_uint16 v_half = v_combine_high(v_src, v_src);
-
-            v_uint32 v_tmp0, v_tmp1;
-            v_expand(v_src + v_half, v_tmp0, v_tmp1);
-            v_sum += v_reinterpret_as_s32(v_tmp0);
-
-            v_int16 v_tmp2, v_tmp3;
-            v_zip(v_reinterpret_as_s16(v_src), v_reinterpret_as_s16(v_half), v_tmp2, v_tmp3);
-            v_sqsum += v_dotprod(v_tmp2, v_tmp2);
-            x += v_uint16::nlanes;
-        }
-
-        if (cn == 1)
-        {
-            *sum += v_reduce_sum(v_sum);
-            *sqsum += v_reduce_sum(v_sqsum);
-        }
-        else
-        {
-            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_int32::nlanes];
-            v_store(ar, v_sum);
-            v_store(ar + v_int32::nlanes, v_sqsum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
-            {
-                sum[i % cn] += ar[i];
-                sqsum[i % cn] += ar[v_int32::nlanes + i];
-            }
-        }
-        v_cleanup();
-        return x / cn;
-    }
-};
-
-template <>
-struct SumSqr_SIMD<schar, int, int>
-{
-    int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
-    {
-        if (mask || (cn != 1 && cn != 2 && cn != 4))
-            return 0;
-        len *= cn;
-
-        int x = 0;
-        v_int32 v_sum = vx_setzero_s32();
-        v_int32 v_sqsum = vx_setzero_s32();
-
-        const int len0 = len & -v_int8::nlanes;
-        while (x < len0)
-        {
-            const int len_tmp = min(x + 256 * v_int16::nlanes, len0);
-            v_int16 v_sum16 = vx_setzero_s16();
-            for (; x < len_tmp; x += v_int8::nlanes)
-            {
-                v_int16 v_src0 = vx_load_expand(src0 + x);
-                v_int16 v_src1 = vx_load_expand(src0 + x + v_int16::nlanes);
-                v_sum16 += v_src0 + v_src1;
-
-                v_int16 v_tmp0, v_tmp1;
-                v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
-                v_sqsum += v_dotprod(v_tmp0, v_tmp0) + v_dotprod(v_tmp1, v_tmp1);
-            }
-            v_int32 v_half0, v_half1;
-            v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
-        }
-        if (x <= len - v_int16::nlanes)
-        {
-            v_int16 v_src = vx_load_expand(src0 + x);
-            v_int16 v_half = v_combine_high(v_src, v_src);
-
-            v_int32 v_tmp0, v_tmp1;
-            v_expand(v_src + v_half, v_tmp0, v_tmp1);
-            v_sum += v_tmp0;
-
-            v_int16 v_tmp2, v_tmp3;
-            v_zip(v_src, v_half, v_tmp2, v_tmp3);
-            v_sqsum += v_dotprod(v_tmp2, v_tmp2);
-            x += v_int16::nlanes;
-        }
-
-        if (cn == 1)
-        {
-            *sum += v_reduce_sum(v_sum);
-            *sqsum += v_reduce_sum(v_sqsum);
-        }
-        else
-        {
-            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_int32::nlanes];
-            v_store(ar, v_sum);
-            v_store(ar + v_int32::nlanes, v_sqsum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
-            {
-                sum[i % cn] += ar[i];
-                sqsum[i % cn] += ar[v_int32::nlanes + i];
-            }
-        }
-        v_cleanup();
-        return x / cn;
-    }
-};
-#endif
-
-template<typename T, typename ST, typename SQT>
-static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn)
-{
-    const T* src = src0;
-
-    if( !mask )
-    {
-        SumSqr_SIMD<T, ST, SQT> vop;
-        int x = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
-        src = src0 + x * cn;
-
-        if( k == 1 )
-        {
-            ST s0 = sum[0];
-            SQT sq0 = sqsum[0];
-            for( int i = x; i < len; i++, src += cn )
-            {
-                T v = src[0];
-                s0 += v; sq0 += (SQT)v*v;
-            }
-            sum[0] = s0;
-            sqsum[0] = sq0;
-        }
-        else if( k == 2 )
-        {
-            ST s0 = sum[0], s1 = sum[1];
-            SQT sq0 = sqsum[0], sq1 = sqsum[1];
-            for( int i = x; i < len; i++, src += cn )
-            {
-                T v0 = src[0], v1 = src[1];
-                s0 += v0; sq0 += (SQT)v0*v0;
-                s1 += v1; sq1 += (SQT)v1*v1;
-            }
-            sum[0] = s0; sum[1] = s1;
-            sqsum[0] = sq0; sqsum[1] = sq1;
-        }
-        else if( k == 3 )
-        {
-            ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
-            SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
-            for( int i = x; i < len; i++, src += cn )
-            {
-                T v0 = src[0], v1 = src[1], v2 = src[2];
-                s0 += v0; sq0 += (SQT)v0*v0;
-                s1 += v1; sq1 += (SQT)v1*v1;
-                s2 += v2; sq2 += (SQT)v2*v2;
-            }
-            sum[0] = s0; sum[1] = s1; sum[2] = s2;
-            sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
-        }
-
-        for( ; k < cn; k += 4 )
-        {
-            src = src0 + x * cn + k;
-            ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
-            SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
-            for( int i = x; i < len; i++, src += cn )
-            {
-                T v0, v1;
-                v0 = src[0], v1 = src[1];
-                s0 += v0; sq0 += (SQT)v0*v0;
-                s1 += v1; sq1 += (SQT)v1*v1;
-                v0 = src[2], v1 = src[3];
-                s2 += v0; sq2 += (SQT)v0*v0;
-                s3 += v1; sq3 += (SQT)v1*v1;
-            }
-            sum[k] = s0; sum[k+1] = s1;
-            sum[k+2] = s2; sum[k+3] = s3;
-            sqsum[k] = sq0; sqsum[k+1] = sq1;
-            sqsum[k+2] = sq2; sqsum[k+3] = sq3;
-        }
-        return len;
-    }
-
-    int i, nzm = 0;
-    if( cn == 1 )
-    {
-        ST s0 = sum[0];
-        SQT sq0 = sqsum[0];
-        for( i = 0; i < len; i++ )
-            if( mask[i] )
-            {
-                T v = src[i];
-                s0 += v; sq0 += (SQT)v*v;
-                nzm++;
-            }
-        sum[0] = s0;
-        sqsum[0] = sq0;
-    }
-    else if( cn == 3 )
-    {
-        ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
-        SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
-        for( i = 0; i < len; i++, src += 3 )
-            if( mask[i] )
-            {
-                T v0 = src[0], v1 = src[1], v2 = src[2];
-                s0 += v0; sq0 += (SQT)v0*v0;
-                s1 += v1; sq1 += (SQT)v1*v1;
-                s2 += v2; sq2 += (SQT)v2*v2;
-                nzm++;
-            }
-        sum[0] = s0; sum[1] = s1; sum[2] = s2;
-        sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
-    }
-    else
-    {
-        for( i = 0; i < len; i++, src += cn )
-            if( mask[i] )
-            {
-                for( int k = 0; k < cn; k++ )
-                {
-                    T v = src[k];
-                    ST s = sum[k] + v;
-                    SQT sq = sqsum[k] + (SQT)v*v;
-                    sum[k] = s; sqsum[k] = sq;
-                }
-                nzm++;
-            }
-    }
-    return nzm;
-}
-
-static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
-{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
-
-static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
-{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
-
-static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
-{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
-
-static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
-{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
-
-static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
-{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
-
-static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
-{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
-
-static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
-{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
-
-typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);
-
-static SumSqrFunc getSumSqrTab(int depth)
-{
-    static SumSqrFunc sumSqrTab[] =
-    {
-        (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
-        (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
-    };
-
-    return sumSqrTab[depth];
+    CV_INSTRUMENT_REGION();
+    CV_CPU_DISPATCH(getSumSqrFunc, (depth),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
 #ifdef HAVE_OPENCL
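The CV_CPU_DISPATCH(getSumSqrFunc, (depth), CV_CPU_DISPATCH_MODES_ALL) call above is what replaces the roughly 300 removed lines in this file: at run time it forwards to the best compiled variant of getSumSqrFunc from mean.simd.hpp. The sketch below only illustrates the idea of that forwarding; the guard macros and the exact expansion are assumptions, not the literal output of the macro.

// Conceptual illustration (assumption) of how the dispatch wrapper resolves
// to one of the per-ISA builds of mean.simd.hpp:
//
//   static SumSqrFunc getSumSqrFunc(int depth)
//   {
//       CV_INSTRUMENT_REGION();
//   #ifdef CV_CPU_DISPATCH_COMPILE_AVX2
//       if (CV_CPU_HAS_SUPPORT_AVX2)                 // runtime CPU feature check
//           return opt_AVX2::getSumSqrFunc(depth);   // AVX2 compilation of mean.simd.hpp
//   #endif
//   #ifdef CV_CPU_DISPATCH_COMPILE_SSE2
//       if (CV_CPU_HAS_SUPPORT_SSE2)
//           return opt_SSE2::getSumSqrFunc(depth);
//   #endif
//       return cpu_baseline::getSumSqrFunc(depth);   // baseline build always exists
//   }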
@@ -798,9 +497,7 @@ static bool ipp_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& m
 }
 #endif
-} // cv::
-
-void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
+
+void meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
 {
     CV_INSTRUMENT_REGION();
@@ -819,7 +516,7 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, Input
     int k, cn = src.channels(), depth = src.depth();
-    SumSqrFunc func = getSumSqrTab(depth);
+    SumSqrFunc func = getSumSqrFunc(depth);
 
     CV_Assert( func != 0 );
@@ -907,3 +604,5 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, Input
             dptr[k] = 0;
     }
 }
+
+} // namespace
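The public behaviour is unchanged by the refactor: cv::meanStdDev still asks getSumSqrFunc(depth) for a kernel, accumulates a per-channel sum and sum of squares, and converts them as mean = sum/n and stddev = sqrt(sqsum/n - mean*mean). A minimal usage sketch follows; the test values are arbitrary and not part of this commit.

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Arbitrary 3-channel test image; any CV_8UC3 data would do.
    cv::Mat img(4, 4, CV_8UC3, cv::Scalar(10, 20, 30));

    cv::Scalar mean, stddev;
    cv::meanStdDev(img, mean, stddev);  // per channel: sum/n and sqrt(sqsum/n - mean^2)

    std::cout << "mean = " << mean << ", stddev = " << stddev << std::endl;
    return 0;
}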
modules/core/src/mean.simd.hpp (new file, 0 → 100644)

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#include "precomp.hpp"
#include "stat.hpp"

namespace cv {

typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

SumSqrFunc getSumSqrFunc(int depth);

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

template <typename T, typename ST, typename SQT>
struct SumSqr_SIMD
{
    inline int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
    {
        return 0;
    }
};

#if CV_SIMD

template <>
struct SumSqr_SIMD<uchar, int, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;
        len *= cn;

        int x = 0;
        v_int32 v_sum = vx_setzero_s32();
        v_int32 v_sqsum = vx_setzero_s32();

        const int len0 = len & -v_uint8::nlanes;
        while(x < len0)
        {
            const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
            v_uint16 v_sum16 = vx_setzero_u16();
            for ( ; x < len_tmp; x += v_uint8::nlanes)
            {
                v_uint16 v_src0 = vx_load_expand(src0 + x);
                v_uint16 v_src1 = vx_load_expand(src0 + x + v_uint16::nlanes);
                v_sum16 += v_src0 + v_src1;

                v_int16 v_tmp0, v_tmp1;
                v_zip(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_src1), v_tmp0, v_tmp1);
                v_sqsum += v_dotprod(v_tmp0, v_tmp0) + v_dotprod(v_tmp1, v_tmp1);
            }
            v_uint32 v_half0, v_half1;
            v_expand(v_sum16, v_half0, v_half1);
            v_sum += v_reinterpret_as_s32(v_half0 + v_half1);
        }
        if (x <= len - v_uint16::nlanes)
        {
            v_uint16 v_src = vx_load_expand(src0 + x);
            v_uint16 v_half = v_combine_high(v_src, v_src);

            v_uint32 v_tmp0, v_tmp1;
            v_expand(v_src + v_half, v_tmp0, v_tmp1);
            v_sum += v_reinterpret_as_s32(v_tmp0);

            v_int16 v_tmp2, v_tmp3;
            v_zip(v_reinterpret_as_s16(v_src), v_reinterpret_as_s16(v_half), v_tmp2, v_tmp3);
            v_sqsum += v_dotprod(v_tmp2, v_tmp2);
            x += v_uint16::nlanes;
        }

        if (cn == 1)
        {
            *sum += v_reduce_sum(v_sum);
            *sqsum += v_reduce_sum(v_sqsum);
        }
        else
        {
            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_int32::nlanes];
            v_store(ar, v_sum);
            v_store(ar + v_int32::nlanes, v_sqsum);
            for (int i = 0; i < v_int32::nlanes; ++i)
            {
                sum[i % cn] += ar[i];
                sqsum[i % cn] += ar[v_int32::nlanes + i];
            }
        }
        v_cleanup();
        return x / cn;
    }
};

template <>
struct SumSqr_SIMD<schar, int, int>
{
    int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;
        len *= cn;

        int x = 0;
        v_int32 v_sum = vx_setzero_s32();
        v_int32 v_sqsum = vx_setzero_s32();

        const int len0 = len & -v_int8::nlanes;
        while (x < len0)
        {
            const int len_tmp = min(x + 256 * v_int16::nlanes, len0);
            v_int16 v_sum16 = vx_setzero_s16();
            for (; x < len_tmp; x += v_int8::nlanes)
            {
                v_int16 v_src0 = vx_load_expand(src0 + x);
                v_int16 v_src1 = vx_load_expand(src0 + x + v_int16::nlanes);
                v_sum16 += v_src0 + v_src1;

                v_int16 v_tmp0, v_tmp1;
                v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
                v_sqsum += v_dotprod(v_tmp0, v_tmp0) + v_dotprod(v_tmp1, v_tmp1);
            }
            v_int32 v_half0, v_half1;
            v_expand(v_sum16, v_half0, v_half1);
            v_sum += v_half0 + v_half1;
        }
        if (x <= len - v_int16::nlanes)
        {
            v_int16 v_src = vx_load_expand(src0 + x);
            v_int16 v_half = v_combine_high(v_src, v_src);

            v_int32 v_tmp0, v_tmp1;
            v_expand(v_src + v_half, v_tmp0, v_tmp1);
            v_sum += v_tmp0;

            v_int16 v_tmp2, v_tmp3;
            v_zip(v_src, v_half, v_tmp2, v_tmp3);
            v_sqsum += v_dotprod(v_tmp2, v_tmp2);
            x += v_int16::nlanes;
        }

        if (cn == 1)
        {
            *sum += v_reduce_sum(v_sum);
            *sqsum += v_reduce_sum(v_sqsum);
        }
        else
        {
            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_int32::nlanes];
            v_store(ar, v_sum);
            v_store(ar + v_int32::nlanes, v_sqsum);
            for (int i = 0; i < v_int32::nlanes; ++i)
            {
                sum[i % cn] += ar[i];
                sqsum[i % cn] += ar[v_int32::nlanes + i];
            }
        }
        v_cleanup();
        return x / cn;
    }
};
#endif

template<typename T, typename ST, typename SQT>
static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn)
{
    const T* src = src0;

    if( !mask )
    {
        SumSqr_SIMD<T, ST, SQT> vop;
        int x = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
        src = src0 + x * cn;

        if( k == 1 )
        {
            ST s0 = sum[0];
            SQT sq0 = sqsum[0];
            for( int i = x; i < len; i++, src += cn )
            {
                T v = src[0];
                s0 += v; sq0 += (SQT)v*v;
            }
            sum[0] = s0;
            sqsum[0] = sq0;
        }
        else if( k == 2 )
        {
            ST s0 = sum[0], s1 = sum[1];
            SQT sq0 = sqsum[0], sq1 = sqsum[1];
            for( int i = x; i < len; i++, src += cn )
            {
                T v0 = src[0], v1 = src[1];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
            }
            sum[0] = s0; sum[1] = s1;
            sqsum[0] = sq0; sqsum[1] = sq1;
        }
        else if( k == 3 )
        {
            ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
            SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
            for( int i = x; i < len; i++, src += cn )
            {
                T v0 = src[0], v1 = src[1], v2 = src[2];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                s2 += v2; sq2 += (SQT)v2*v2;
            }
            sum[0] = s0; sum[1] = s1; sum[2] = s2;
            sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
        }

        for( ; k < cn; k += 4 )
        {
            src = src0 + x * cn + k;
            ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
            SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
            for( int i = x; i < len; i++, src += cn )
            {
                T v0, v1;
                v0 = src[0], v1 = src[1];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                v0 = src[2], v1 = src[3];
                s2 += v0; sq2 += (SQT)v0*v0;
                s3 += v1; sq3 += (SQT)v1*v1;
            }
            sum[k] = s0; sum[k+1] = s1;
            sum[k+2] = s2; sum[k+3] = s3;
            sqsum[k] = sq0; sqsum[k+1] = sq1;
            sqsum[k+2] = sq2; sqsum[k+3] = sq3;
        }
        return len;
    }

    int i, nzm = 0;
    if( cn == 1 )
    {
        ST s0 = sum[0];
        SQT sq0 = sqsum[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                T v = src[i];
                s0 += v; sq0 += (SQT)v*v;
                nzm++;
            }
        sum[0] = s0;
        sqsum[0] = sq0;
    }
    else if( cn == 3 )
    {
        ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
        SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                T v0 = src[0], v1 = src[1], v2 = src[2];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                s2 += v2; sq2 += (SQT)v2*v2;
                nzm++;
            }
        sum[0] = s0; sum[1] = s1; sum[2] = s2;
        sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
    }
    else
    {
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                {
                    T v = src[k];
                    ST s = sum[k] + v;
                    SQT sq = sqsum[k] + (SQT)v*v;
                    sum[k] = s; sqsum[k] = sq;
                }
                nzm++;
            }
    }
    return nzm;
}

static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ CV_INSTRUMENT_REGION(); return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ CV_INSTRUMENT_REGION(); return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ CV_INSTRUMENT_REGION(); return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ CV_INSTRUMENT_REGION(); return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ CV_INSTRUMENT_REGION(); return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ CV_INSTRUMENT_REGION(); return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ CV_INSTRUMENT_REGION(); return sumsqr_(src, mask, sum, sqsum, len, cn); }

SumSqrFunc getSumSqrFunc(int depth)
{
    CV_INSTRUMENT_REGION();

    static SumSqrFunc sumSqrTab[] =
    {
        (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
        (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
    };

    return sumSqrTab[depth];
}

#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // namespace
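Every SumSqrFunc in the table above, vectorised or not, fills the same two accumulators. A plain scalar reference for the unmasked single-channel 8-bit case (a hypothetical helper, not part of the commit) makes that contract explicit and is a convenient cross-check for the SumSqr_SIMD specialisations:

#include <cstdint>

// Hypothetical scalar reference: accumulate the per-channel sum and sum of
// squares of 8-bit pixels, the same quantities SumSqr_SIMD<uchar,int,int>
// produces and that meanStdDev turns into mean = sum/n and
// stddev = sqrt(sqsum/n - mean*mean).
static int sumSqrRef8u(const uint8_t* src, int* sum, int* sqsum, int len)
{
    int s = 0, sq = 0;
    for (int i = 0; i < len; i++)
    {
        int v = src[i];
        s += v;
        sq += v * v;   // int is enough only because the caller limits block sizes,
                       // as the real sumsqr_ callers do for integer depths
    }
    *sum += s;
    *sqsum += sq;
    return len;        // number of processed pixels, mirroring sumsqr_'s unmasked contract
}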