opencv / Commits

Commit 10e6491c, authored Jul 14, 2017 by Alexander Alekhin

Merge pull request #9024 from tomoaki0705:featureDispatchAccumulate

Parents: 4238add3, e7d5dbfe

Showing 5 changed files with 87 additions and 1707 deletions:

    modules/core/include/opencv2/core/hal/intrin_sse.hpp   +59  -0
    modules/imgproc/CMakeLists.txt                          +1   -0
    modules/imgproc/src/accum.cpp                           +6   -1707
    modules/imgproc/src/accum.dispatch.cpp                  +21  -0
    modules/imgproc/src/accum.simd.hpp                      +0   -0
modules/core/include/opencv2/core/hal/intrin_sse.hpp
...
@@ -899,6 +899,15 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \

OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }

OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64);
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64);

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
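A note on the new 64-bit comparison macro: SSE2 has no native 64-bit integer compare instruction, so OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP routes equality through the double-precision comparison plus reinterpret casts. The expansion is mechanical; for v_uint64x2 it yields:

    inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
    { return v_reinterpret_as_u64(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); }
    inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
    { return v_reinterpret_as_u64(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }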
...
@@ -1520,6 +1529,35 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
    c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
}

inline void v_load_deinterleave(const int64* ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
{
    v_uint64x2 t0, t1, t2;
    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
    a = v_reinterpret_as_s64(t0);
    b = v_reinterpret_as_s64(t1);
    c = v_reinterpret_as_s64(t2);
}

inline void v_load_deinterleave(const double* ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
{
    v_uint64x2 t0, t1, t2;
    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
    a = v_reinterpret_as_f64(t0);
    b = v_reinterpret_as_f64(t1);
    c = v_reinterpret_as_f64(t2);
}

// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
...
@@ -1717,6 +1755,27 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
    _mm_storeu_ps((ptr + 4), u1);
}

inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
    __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);
    _mm_storeu_si128((__m128i*)ptr, t0);
    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
}

inline void v_store_interleave(int64* ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
{
    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}

inline void v_store_interleave(double* ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
...
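Together these overloads let 3-channel 64-bit data round-trip through per-channel registers. A minimal usage sketch (hypothetical buffer; assumes a target where these types are available):

    // Deinterleave two pixels of packed (x, y, z) uint64 data, then repack.
    uint64 buf[6] = { 1, 2, 3, 4, 5, 6 };   // x0 y0 z0 x1 y1 z1
    v_uint64x2 x, y, z;
    v_load_deinterleave(buf, x, y, z);      // x = {1,4}, y = {2,5}, z = {3,6}
    v_store_interleave(buf, x, y, z);       // writes 1 2 3 4 5 6 back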
modules/imgproc/CMakeLists.txt
set(the_description "Image Processing")
ocv_add_dispatched_file(accum SSE2 AVX NEON)
ocv_define_module(imgproc opencv_core WRAP java python)
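The new ocv_add_dispatched_file(accum SSE2 AVX NEON) line is the heart of this commit: it asks OpenCV's build system to compile accum.simd.hpp once per listed instruction set in addition to the baseline build, and to generate the accum.simd_declarations.hpp header consumed by accum.dispatch.cpp below. A sketch of the generated pieces (file names illustrative, not verbatim build output):

    # accum.sse2.cpp  - wraps accum.simd.hpp, compiled with SSE2 flags
    # accum.avx.cpp   - wraps accum.simd.hpp, compiled with AVX flags
    # accum.neon.cpp  - wraps accum.simd.hpp, compiled for NEON targets
    # accum.simd_declarations.hpp - per-variant declarations and the
    #                               CV_CPU_DISPATCH_MODES_ALL definition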
modules/imgproc/src/accum.cpp
...
@@ -44,1718 +44,17 @@

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "opencv2/core/hal/intrin.hpp"

#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#include "accum.simd.hpp"
#include "accum.simd_declarations.hpp"

#include "opencv2/core/openvx/ovx_defs.hpp"

namespace cv
{

template <typename T, typename AT>
struct Acc_SIMD
{
    int operator() (const T*, AT*, const uchar*, int, int) const
    {
        return 0;
    }
};

template <typename T, typename AT>
struct AccSqr_SIMD
{
    int operator() (const T*, AT*, const uchar*, int, int) const
    {
        return 0;
    }
};

template <typename T, typename AT>
struct AccProd_SIMD
{
    int operator() (const T*, const T*, AT*, const uchar*, int, int) const
    {
        return 0;
    }
};

template <typename T, typename AT>
struct AccW_SIMD
{
    int operator() (const T*, AT*, const uchar*, int, int, AT) const
    {
        return 0;
    }
};
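These default functors encode the fallback protocol used throughout this file: operator() returns the number of elements already handled with SIMD (zero for the generic no-op), and the scalar loops in acc_/accSqr_/accProd_/accW_ further down resume from that index. A minimal sketch of the no-mask case (mirroring acc_ below):

    int i = Acc_SIMD<float, float>()(src, dst, mask, len, cn); // 0 if no SIMD path
    for ( ; i < len*cn; i++ )  // scalar tail finishes whatever SIMD left over
        dst[i] += src[i];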
#if CV_AVX
template <>
struct Acc_SIMD<float, float>
{
    int operator() (const float* src, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src = _mm256_loadu_ps(src + x);
                __m256 v_dst = _mm256_loadu_ps(dst + x);
                v_dst = _mm256_add_ps(v_src, v_dst);
                _mm256_storeu_ps(dst + x, v_dst);
            }
        }
        return x;
    }
};

template <>
struct Acc_SIMD<float, double>
{
    int operator() (const float* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src = _mm256_loadu_ps(src + x);
                __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));
                __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));
                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
                _mm256_storeu_pd(dst + x, v_dst0);
                _mm256_storeu_pd(dst + x + 4, v_dst1);
            }
        }
        return x;
    }
};

template <>
struct Acc_SIMD<double, double>
{
    int operator() (const double* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m256d v_src = _mm256_loadu_pd(src + x);
                __m256d v_dst = _mm256_loadu_pd(dst + x);
                v_dst = _mm256_add_pd(v_dst, v_src);
                _mm256_storeu_pd(dst + x, v_dst);
            }
        }
        return x;
    }
};
template <>
struct AccSqr_SIMD<float, float>
{
    int operator() (const float* src, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src = _mm256_loadu_ps(src + x);
                __m256 v_dst = _mm256_loadu_ps(dst + x);
                v_src = _mm256_mul_ps(v_src, v_src);
                v_dst = _mm256_add_ps(v_src, v_dst);
                _mm256_storeu_ps(dst + x, v_dst);
            }
        }
        return x;
    }
};

template <>
struct AccSqr_SIMD<float, double>
{
    int operator() (const float* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src = _mm256_loadu_ps(src + x);
                __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));
                __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));
                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
                v_src0 = _mm256_mul_pd(v_src0, v_src0);
                v_src1 = _mm256_mul_pd(v_src1, v_src1);
                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
                _mm256_storeu_pd(dst + x, v_dst0);
                _mm256_storeu_pd(dst + x + 4, v_dst1);
            }
        }
        return x;
    }
};

template <>
struct AccSqr_SIMD<double, double>
{
    int operator() (const double* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m256d v_src = _mm256_loadu_pd(src + x);
                __m256d v_dst = _mm256_loadu_pd(dst + x);
                v_src = _mm256_mul_pd(v_src, v_src);
                v_dst = _mm256_add_pd(v_dst, v_src);
                _mm256_storeu_pd(dst + x, v_dst);
            }
        }
        return x;
    }
};
template <>
struct AccProd_SIMD<float, float>
{
    int operator() (const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_src0 = _mm256_loadu_ps(src1 + x);
                __m256 v_src1 = _mm256_loadu_ps(src2 + x);
                __m256 v_dst = _mm256_loadu_ps(dst + x);
                __m256 v_src = _mm256_mul_ps(v_src0, v_src1);
                v_dst = _mm256_add_ps(v_src, v_dst);
                _mm256_storeu_ps(dst + x, v_dst);
            }
        }
        return x;
    }
};

template <>
struct AccProd_SIMD<float, double>
{
    int operator() (const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256 v_1src = _mm256_loadu_ps(src1 + x);
                __m256 v_2src = _mm256_loadu_ps(src2 + x);
                __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src, 0));
                __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src, 1));
                __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src, 0));
                __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src, 1));
                __m256d v_dst0 = _mm256_loadu_pd(dst + x);
                __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
                __m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
                __m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
                v_dst0 = _mm256_add_pd(v_src0, v_dst0);
                v_dst1 = _mm256_add_pd(v_src1, v_dst1);
                _mm256_storeu_pd(dst + x, v_dst0);
                _mm256_storeu_pd(dst + x + 4, v_dst1);
            }
        }
        return x;
    }
};

template <>
struct AccProd_SIMD<double, double>
{
    int operator() (const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                __m256d v_src0 = _mm256_loadu_pd(src1 + x);
                __m256d v_src1 = _mm256_loadu_pd(src2 + x);
                __m256d v_dst = _mm256_loadu_pd(dst + x);
                v_src0 = _mm256_mul_pd(v_src0, v_src1);
                v_dst = _mm256_add_pd(v_dst, v_src0);
                _mm256_storeu_pd(dst + x, v_dst);
            }
        }
        return x;
    }
};
template <>
struct AccW_SIMD<float, float>
{
    int operator() (const float* src, float* dst, const uchar* mask, int len, int cn, float alpha) const
    {
        int x = 0;
        __m256 v_alpha = _mm256_set1_ps(alpha);
        __m256 v_beta = _mm256_set1_ps(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                _mm256_storeu_ps(dst + x, _mm256_add_ps(
                    _mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta),
                    _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
                _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(
                    _mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta),
                    _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
            }
        }
        return x;
    }
};

template <>
struct AccW_SIMD<float, double>
{
    int operator() (const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m256d v_alpha = _mm256_set1_pd(alpha);
        __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                __m256 v_src0 = _mm256_loadu_ps(src + x);
                __m256 v_src1 = _mm256_loadu_ps(src + x + 8);
                __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0, 0));
                __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0, 1));
                __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1, 0));
                __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1, 1));
                _mm256_storeu_pd(dst + x, _mm256_add_pd(
                    _mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta),
                    _mm256_mul_pd(v_src00, v_alpha)));
                _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(
                    _mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta),
                    _mm256_mul_pd(v_src01, v_alpha)));
                _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(
                    _mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta),
                    _mm256_mul_pd(v_src10, v_alpha)));
                _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(
                    _mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta),
                    _mm256_mul_pd(v_src11, v_alpha)));
            }
        }
        return x;
    }
};

template <>
struct AccW_SIMD<double, double>
{
    int operator() (const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) const
    {
        int x = 0;
        __m256d v_alpha = _mm256_set1_pd(alpha);
        __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                __m256d v_src0 = _mm256_loadu_pd(src + x);
                __m256d v_src1 = _mm256_loadu_pd(src + x + 4);
                _mm256_storeu_pd(dst + x, _mm256_add_pd(
                    _mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta),
                    _mm256_mul_pd(v_src0, v_alpha)));
                _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(
                    _mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta),
                    _mm256_mul_pd(v_src1, v_alpha)));
            }
        }
        return x;
    }
};
#elif CV_SIMD128
template <>
struct Acc_SIMD<float, float>
{
    int operator() (const float* src, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_store(dst + x, v_load(dst + x) + v_load(src + x));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src + x + 4));
            }
        }
        return x;
    }
};

#if CV_SIMD128_64F
template <>
struct Acc_SIMD<float, double>
{
    int operator() (const float* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                v_float32x4 v_src = v_load(src + x);
                v_float64x2 v_src0 = v_cvt_f64(v_src);
                v_float64x2 v_src1 = v_cvt_f64_high(v_src);
                v_store(dst + x, v_load(dst + x) + v_src0);
                v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
            }
        }
        return x;
    }
};

template <>
struct Acc_SIMD<double, double>
{
    int operator() (const double* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                v_float64x2 v_src0 = v_load(src + x);
                v_float64x2 v_src1 = v_load(src + x + 2);
                v_store(dst + x, v_load(dst + x) + v_src0);
                v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
            }
        }
        return x;
    }
};
#endif //CV_SIMD128_64F
template <>
struct AccSqr_SIMD<float, float>
{
    int operator() (const float* src, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_float32x4 v_src0 = v_load(src + x);
                v_float32x4 v_src1 = v_load(src + x + 4);
                v_src0 = v_src0 * v_src0;
                v_src1 = v_src1 * v_src1;
                v_store(dst + x, v_load(dst + x) + v_src0);
                v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
            }
        }
        return x;
    }
};

#if CV_SIMD128_64F
template <>
struct AccSqr_SIMD<float, double>
{
    int operator() (const float* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                v_float32x4 v_src = v_load(src + x);
                v_float64x2 v_src0 = v_cvt_f64(v_src);
                v_float64x2 v_src1 = v_cvt_f64_high(v_src);
                v_src0 = v_src0 * v_src0;
                v_src1 = v_src1 * v_src1;
                v_store(dst + x, v_load(dst + x) + v_src0);
                v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
            }
        }
        return x;
    }
};

template <>
struct AccSqr_SIMD<double, double>
{
    int operator() (const double* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                v_float64x2 v_src0 = v_load(src + x);
                v_float64x2 v_src1 = v_load(src + x + 2);
                v_src0 = v_src0 * v_src0;
                v_src1 = v_src1 * v_src1;
                v_store(dst + x, v_load(dst + x) + v_src0);
                v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
            }
        }
        return x;
    }
};
#endif //CV_SIMD128_64F
template <>
struct AccProd_SIMD<float, float>
{
    int operator() (const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_store(dst + x, v_load(dst + x) + v_load(src1 + x) * v_load(src2 + x));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src1 + x + 4) * v_load(src2 + x + 4));
            }
        }
        return x;
    }
};

#if CV_SIMD128_64F
template <>
struct AccProd_SIMD<float, double>
{
    int operator() (const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                v_float32x4 v_1src = v_load(src1 + x);
                v_float32x4 v_2src = v_load(src2 + x);
                v_float64x2 v_1src0 = v_cvt_f64(v_1src);
                v_float64x2 v_1src1 = v_cvt_f64_high(v_1src);
                v_float64x2 v_2src0 = v_cvt_f64(v_2src);
                v_float64x2 v_2src1 = v_cvt_f64_high(v_2src);
                v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0));
                v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1));
            }
        }
        return x;
    }
};

template <>
struct AccProd_SIMD<double, double>
{
    int operator() (const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                v_float64x2 v_src00 = v_load(src1 + x);
                v_float64x2 v_src01 = v_load(src1 + x + 2);
                v_float64x2 v_src10 = v_load(src2 + x);
                v_float64x2 v_src11 = v_load(src2 + x + 2);
                v_store(dst + x, v_load(dst + x) + (v_src00 * v_src10));
                v_store(dst + x + 2, v_load(dst + x + 2) + (v_src01 * v_src11));
            }
        }
        return x;
    }
};
#endif //CV_SIMD128_64F
template <>
struct AccW_SIMD<float, float>
{
    int operator() (const float* src, float* dst, const uchar* mask, int len, int cn, float alpha) const
    {
        int x = 0;
        v_float32x4 v_alpha = v_setall_f32(alpha);
        v_float32x4 v_beta = v_setall_f32(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_load(src + x) * v_alpha)));
                v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_load(src + x + 4) * v_alpha)));
            }
        }
        return x;
    }
};

#if CV_SIMD128_64F
template <>
struct AccW_SIMD<float, double>
{
    int operator() (const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) const
    {
        int x = 0;
        v_float64x2 v_alpha = v_setall_f64(alpha);
        v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_float32x4 v_src0 = v_load(src + x);
                v_float32x4 v_src1 = v_load(src + x + 4);
                v_float64x2 v_src00 = v_cvt_f64(v_src0);
                v_float64x2 v_src01 = v_cvt_f64_high(v_src0);
                v_float64x2 v_src10 = v_cvt_f64(v_src1);
                v_float64x2 v_src11 = v_cvt_f64_high(v_src1);
                v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src00 * v_alpha)));
                v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src01 * v_alpha)));
                v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_src10 * v_alpha)));
                v_store(dst + x + 6, ((v_load(dst + x + 6) * v_beta) + (v_src11 * v_alpha)));
            }
        }
        return x;
    }
};

template <>
struct AccW_SIMD<double, double>
{
    int operator() (const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) const
    {
        int x = 0;
        v_float64x2 v_alpha = v_setall_f64(alpha);
        v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 4; x += 4)
            {
                v_float64x2 v_src0 = v_load(src + x);
                v_float64x2 v_src1 = v_load(src + x + 2);
                v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src0 * v_alpha)));
                v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src1 * v_alpha)));
            }
        }
        return x;
    }
};
#endif //CV_SIMD128_64F
#endif //CV_SIMD128
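Every AccW specialization above computes the same exponential running average that backs cv::accumulateWeighted: dst = dst*(1 - alpha) + src*alpha. A one-element worked example with alpha = 0.25:

    // dst = 8, src = 4, alpha = 0.25
    // dst*(1 - alpha) + src*alpha = 8*0.75 + 4*0.25 = 6 + 1 = 7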
#if CV_SIMD128
template <>
struct Acc_SIMD<uchar, float>
{
    int operator() (const uchar* src, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                v_uint8x16 v_src = v_load(src + x);
                v_uint16x8 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);
                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
            }
        }
        else if (cn == 1)
        {
            v_uint8x16 v_0 = v_setall_u8(0);
            for ( ; x <= len - 16; x += 16)
            {
                v_uint8x16 v_mask = v_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                v_uint8x16 v_src = v_load(src + x);
                v_src = v_src & v_mask;
                v_uint16x8 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);
                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
            }
        }
        return x;
    }
};

template <>
struct Acc_SIMD<ushort, float>
{
    int operator() (const ushort* src, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint16x8 v_src = v_load(src + x);
                v_uint32x4 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);
                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
            }
        }
        return x;
    }
};
#if CV_SIMD128_64F
template <>
struct Acc_SIMD<uchar, double>
{
    int operator() (const uchar* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                v_uint8x16 v_src = v_load(src + x);
                v_uint16x8 v_int0, v_int1;
                v_expand(v_src, v_int0, v_int1);
                v_uint32x4 v_int00, v_int01, v_int10, v_int11;
                v_expand(v_int0, v_int00, v_int01);
                v_expand(v_int1, v_int10, v_int11);
                v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
                v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
                v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
                v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
                v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
                v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
                v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
                v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
                v_float64x2 v_dst0 = v_load(dst + x);
                v_float64x2 v_dst1 = v_load(dst + x + 2);
                v_float64x2 v_dst2 = v_load(dst + x + 4);
                v_float64x2 v_dst3 = v_load(dst + x + 6);
                v_float64x2 v_dst4 = v_load(dst + x + 8);
                v_float64x2 v_dst5 = v_load(dst + x + 10);
                v_float64x2 v_dst6 = v_load(dst + x + 12);
                v_float64x2 v_dst7 = v_load(dst + x + 14);
                v_dst0 = v_dst0 + v_src0;
                v_dst1 = v_dst1 + v_src1;
                v_dst2 = v_dst2 + v_src2;
                v_dst3 = v_dst3 + v_src3;
                v_dst4 = v_dst4 + v_src4;
                v_dst5 = v_dst5 + v_src5;
                v_dst6 = v_dst6 + v_src6;
                v_dst7 = v_dst7 + v_src7;
                v_store(dst + x, v_dst0);
                v_store(dst + x + 2, v_dst1);
                v_store(dst + x + 4, v_dst2);
                v_store(dst + x + 6, v_dst3);
                v_store(dst + x + 8, v_dst4);
                v_store(dst + x + 10, v_dst5);
                v_store(dst + x + 12, v_dst6);
                v_store(dst + x + 14, v_dst7);
            }
        }
        return x;
    }
};

template <>
struct Acc_SIMD<ushort, double>
{
    int operator() (const ushort* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint16x8 v_src = v_load(src + x);
                v_uint32x4 v_int0, v_int1;
                v_expand(v_src, v_int0, v_int1);
                v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
                v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
                v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
                v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
                v_float64x2 v_dst0 = v_load(dst + x);
                v_float64x2 v_dst1 = v_load(dst + x + 2);
                v_float64x2 v_dst2 = v_load(dst + x + 4);
                v_float64x2 v_dst3 = v_load(dst + x + 6);
                v_dst0 = v_dst0 + v_src0;
                v_dst1 = v_dst1 + v_src1;
                v_dst2 = v_dst2 + v_src2;
                v_dst3 = v_dst3 + v_src3;
                v_store(dst + x, v_dst0);
                v_store(dst + x + 2, v_dst1);
                v_store(dst + x + 4, v_dst2);
                v_store(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
#endif
template <>
struct AccSqr_SIMD<uchar, float>
{
    int operator() (const uchar* src, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                v_uint8x16 v_src = v_load(src + x);
                v_uint16x8 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);
                v_src0 = v_src0 * v_src0;
                v_src1 = v_src1 * v_src1;
                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
            }
        }
        else if (cn == 1)
        {
            v_uint8x16 v_0 = v_setall_u8(0);
            for ( ; x <= len - 16; x += 16)
            {
                v_uint8x16 v_mask = v_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                v_uint8x16 v_src = v_load(src + x);
                v_src = v_src & v_mask;
                v_uint16x8 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);
                v_src0 = v_src0 * v_src0;
                v_src1 = v_src1 * v_src1;
                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
            }
        }
        return x;
    }
};

template <>
struct AccSqr_SIMD<ushort, float>
{
    int operator() (const ushort* src, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint16x8 v_src = v_load(src + x);
                v_uint32x4 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);
                v_float32x4 v_float0, v_float1;
                v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
                v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
                v_float0 = v_float0 * v_float0;
                v_float1 = v_float1 * v_float1;
                v_store(dst + x, v_load(dst + x) + v_float0);
                v_store(dst + x + 4, v_load(dst + x + 4) + v_float1);
            }
        }
        return x;
    }
};
#if CV_SIMD128_64F
template <>
struct AccSqr_SIMD<uchar, double>
{
    int operator() (const uchar* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint8x16 v_src = v_load(src + x);
                v_uint16x8 v_int, dummy;
                v_expand(v_src, v_int, dummy);
                v_uint32x4 v_int0, v_int1;
                v_expand(v_int, v_int0, v_int1);
                v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
                v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
                v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
                v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
                v_src0 = v_src0 * v_src0;
                v_src1 = v_src1 * v_src1;
                v_src2 = v_src2 * v_src2;
                v_src3 = v_src3 * v_src3;
                v_float64x2 v_dst0 = v_load(dst + x);
                v_float64x2 v_dst1 = v_load(dst + x + 2);
                v_float64x2 v_dst2 = v_load(dst + x + 4);
                v_float64x2 v_dst3 = v_load(dst + x + 6);
                v_dst0 += v_src0;
                v_dst1 += v_src1;
                v_dst2 += v_src2;
                v_dst3 += v_src3;
                v_store(dst + x, v_dst0);
                v_store(dst + x + 2, v_dst1);
                v_store(dst + x + 4, v_dst2);
                v_store(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};

template <>
struct AccSqr_SIMD<ushort, double>
{
    int operator() (const ushort* src, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint16x8 v_src = v_load(src + x);
                v_uint32x4 v_int_0, v_int_1;
                v_expand(v_src, v_int_0, v_int_1);
                v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
                v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
                v_float64x2 v_src0 = v_cvt_f64(v_int0);
                v_float64x2 v_src1 = v_cvt_f64_high(v_int0);
                v_float64x2 v_src2 = v_cvt_f64(v_int1);
                v_float64x2 v_src3 = v_cvt_f64_high(v_int1);
                v_src0 = v_src0 * v_src0;
                v_src1 = v_src1 * v_src1;
                v_src2 = v_src2 * v_src2;
                v_src3 = v_src3 * v_src3;
                v_float64x2 v_dst0 = v_load(dst + x);
                v_float64x2 v_dst1 = v_load(dst + x + 2);
                v_float64x2 v_dst2 = v_load(dst + x + 4);
                v_float64x2 v_dst3 = v_load(dst + x + 6);
                v_dst0 += v_src0;
                v_dst1 += v_src1;
                v_dst2 += v_src2;
                v_dst3 += v_src3;
                v_store(dst + x, v_dst0);
                v_store(dst + x + 2, v_dst1);
                v_store(dst + x + 4, v_dst2);
                v_store(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
#endif
template <>
struct AccProd_SIMD<uchar, float>
{
    int operator() (const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        len *= cn;
        if (!mask)
        {
            for ( ; x <= len - 16; x += 16)
            {
                v_uint8x16 v_1src = v_load(src1 + x);
                v_uint8x16 v_2src = v_load(src2 + x);
                v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1;
                v_expand(v_1src, v_1src0, v_1src1);
                v_expand(v_2src, v_2src0, v_2src1);
                v_uint16x8 v_src0, v_src1;
                v_src0 = v_1src0 * v_2src0;
                v_src1 = v_1src1 * v_2src1;
                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
            }
        }
        else if (cn == 1)
        {
            v_uint8x16 v_0 = v_setzero_u8();
            for ( ; x <= len - 16; x += 16)
            {
                v_uint8x16 v_mask = v_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                v_uint8x16 v_1src = v_load(src1 + x) & v_mask;
                v_uint8x16 v_2src = v_load(src2 + x) & v_mask;
                v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1;
                v_expand(v_1src, v_1src0, v_1src1);
                v_expand(v_2src, v_2src0, v_2src1);
                v_uint16x8 v_src0, v_src1;
                v_src0 = v_1src0 * v_2src0;
                v_src1 = v_1src1 * v_2src1;
                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
            }
        }
        return x;
    }
};

template <>
struct AccProd_SIMD<ushort, float>
{
    int operator() (const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint16x8 v_1src = v_load(src1 + x);
                v_uint16x8 v_2src = v_load(src2 + x);
                v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1;
                v_expand(v_1src, v_1src0, v_1src1);
                v_expand(v_2src, v_2src0, v_2src1);
                v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
                v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
                v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
                v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
                v_float32x4 v_src0 = v_1float0 * v_2float0;
                v_float32x4 v_src1 = v_1float1 * v_2float1;
                v_store(dst + x, v_load(dst + x) + v_src0);
                v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
            }
        }
        else if (cn == 1)
        {
            v_uint16x8 v_0 = v_setzero_u16();
            for ( ; x <= len - 8; x += 8)
            {
                v_uint8x16 v_mask = v_load_halves(mask + x, mask + x);
                v_uint16x8 v_mask0, v_mask1;
                v_expand(v_mask, v_mask0, v_mask1);
                v_mask0 = ~(v_0 == v_mask0);
                v_uint16x8 v_1src = v_load(src1 + x) & v_mask0;
                v_uint16x8 v_2src = v_load(src2 + x) & v_mask0;
                v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1;
                v_expand(v_1src, v_1src0, v_1src1);
                v_expand(v_2src, v_2src0, v_2src1);
                v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
                v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
                v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
                v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
                v_float32x4 v_src0 = v_1float0 * v_2float0;
                v_float32x4 v_src1 = v_1float1 * v_2float1;
                v_store(dst + x, v_load(dst + x) + v_src0);
                v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
            }
        }
        return x;
    }
};
#if CV_SIMD128_64F
template <>
struct AccProd_SIMD<uchar, double>
{
    int operator() (const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint8x16 v_1src = v_load(src1 + x);
                v_uint8x16 v_2src = v_load(src2 + x);
                v_uint16x8 v_1int, v_2int, dummy;
                v_expand(v_1src, v_1int, dummy);
                v_expand(v_2src, v_2int, dummy);
                v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                v_expand(v_1int, v_1int_0, v_1int_1);
                v_expand(v_2int, v_2int_0, v_2int_1);
                v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
                v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
                v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
                v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
                v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
                v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
                v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
                v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
                v_float64x2 v_dst0 = v_load(dst + x);
                v_float64x2 v_dst1 = v_load(dst + x + 2);
                v_float64x2 v_dst2 = v_load(dst + x + 4);
                v_float64x2 v_dst3 = v_load(dst + x + 6);
                v_dst0 += v_src0;
                v_dst1 += v_src1;
                v_dst2 += v_src2;
                v_dst3 += v_src3;
                v_store(dst + x, v_dst0);
                v_store(dst + x + 2, v_dst1);
                v_store(dst + x + 4, v_dst2);
                v_store(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};

template <>
struct AccProd_SIMD<ushort, double>
{
    int operator() (const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) const
    {
        int x = 0;
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint16x8 v_1src = v_load(src1 + x);
                v_uint16x8 v_2src = v_load(src2 + x);
                v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                v_expand(v_1src, v_1int_0, v_1int_1);
                v_expand(v_2src, v_2int_0, v_2int_1);
                v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
                v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
                v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
                v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
                v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
                v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
                v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
                v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
                v_float64x2 v_dst0 = v_load(dst + x);
                v_float64x2 v_dst1 = v_load(dst + x + 2);
                v_float64x2 v_dst2 = v_load(dst + x + 4);
                v_float64x2 v_dst3 = v_load(dst + x + 6);
                v_dst0 = v_dst0 + v_src0;
                v_dst1 = v_dst1 + v_src1;
                v_dst2 = v_dst2 + v_src2;
                v_dst3 = v_dst3 + v_src3;
                v_store(dst + x, v_dst0);
                v_store(dst + x + 2, v_dst1);
                v_store(dst + x + 4, v_dst2);
                v_store(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};
#endif
template <>
struct AccW_SIMD<uchar, float>
{
    int operator() (const uchar* src, float* dst, const uchar* mask, int len, int cn, float alpha) const
    {
        int x = 0;
        v_float32x4 v_alpha = v_setall_f32(alpha);
        v_float32x4 v_beta = v_setall_f32(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                v_uint8x16 v_src = v_load(src + x);
                v_uint16x8 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);
                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_float32x4 v_dst00 = v_load(dst + x);
                v_float32x4 v_dst01 = v_load(dst + x + 4);
                v_float32x4 v_dst10 = v_load(dst + x + 8);
                v_float32x4 v_dst11 = v_load(dst + x + 12);
                v_dst00 = (v_dst00 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha);
                v_dst01 = (v_dst01 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha);
                v_dst10 = (v_dst10 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
                v_dst11 = (v_dst11 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);
                v_store(dst + x, v_dst00);
                v_store(dst + x + 4, v_dst01);
                v_store(dst + x + 8, v_dst10);
                v_store(dst + x + 12, v_dst11);
            }
        }
        return x;
    }
};

template <>
struct AccW_SIMD<ushort, float>
{
    int operator() (const ushort* src, float* dst, const uchar* mask, int len, int cn, float alpha) const
    {
        int x = 0;
        v_float32x4 v_alpha = v_setall_f32(alpha);
        v_float32x4 v_beta = v_setall_f32(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint16x8 v_src = v_load(src + x);
                v_uint32x4 v_int0, v_int1;
                v_expand(v_src, v_int0, v_int1);
                v_float32x4 v_src0 = v_cvt_f32(v_reinterpret_as_s32(v_int0));
                v_float32x4 v_src1 = v_cvt_f32(v_reinterpret_as_s32(v_int1));
                v_src0 = v_src0 * v_alpha;
                v_src1 = v_src1 * v_alpha;
                v_float32x4 v_dst0 = v_load(dst + x) * v_beta;
                v_float32x4 v_dst1 = v_load(dst + x + 4) * v_beta;
                v_store(dst + x, v_dst0 + v_src0);
                v_store(dst + x + 4, v_dst1 + v_src1);
            }
        }
        return x;
    }
};
#if CV_SIMD128_64F
template <>
struct AccW_SIMD<uchar, double>
{
    int operator() (const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) const
    {
        int x = 0;
        v_float64x2 v_alpha = v_setall_f64(alpha);
        v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint8x16 v_src = v_load(src + x);
                v_uint16x8 v_int, dummy;
                v_expand(v_src, v_int, dummy);
                v_uint32x4 v_int_0, v_int_1;
                v_expand(v_int, v_int_0, v_int_1);
                v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
                v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
                v_float64x2 v_src0 = v_cvt_f64(v_int0);
                v_float64x2 v_src1 = v_cvt_f64_high(v_int0);
                v_float64x2 v_src2 = v_cvt_f64(v_int1);
                v_float64x2 v_src3 = v_cvt_f64_high(v_int1);
                v_float64x2 v_dst0 = v_load(dst + x);
                v_float64x2 v_dst1 = v_load(dst + x + 2);
                v_float64x2 v_dst2 = v_load(dst + x + 4);
                v_float64x2 v_dst3 = v_load(dst + x + 6);
                v_dst0 = (v_dst0 * v_beta) + (v_src0 * v_alpha);
                v_dst1 = (v_dst1 * v_beta) + (v_src1 * v_alpha);
                v_dst2 = (v_dst2 * v_beta) + (v_src2 * v_alpha);
                v_dst3 = (v_dst3 * v_beta) + (v_src3 * v_alpha);
                v_store(dst + x, v_dst0);
                v_store(dst + x + 2, v_dst1);
                v_store(dst + x + 4, v_dst2);
                v_store(dst + x + 6, v_dst3);
            }
        }
        return x;
    }
};

template <>
struct AccW_SIMD<ushort, double>
{
    int operator() (const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) const
    {
        int x = 0;
        v_float64x2 v_alpha = v_setall_f64(alpha);
        v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                v_uint16x8 v_src = v_load(src + x);
                v_uint32x4 v_int_0, v_int_1;
                v_expand(v_src, v_int_0, v_int_1);
                v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
                v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
                v_float64x2 v_src00 = v_cvt_f64(v_int0);
                v_float64x2 v_src01 = v_cvt_f64_high(v_int0);
                v_float64x2 v_src10 = v_cvt_f64(v_int1);
                v_float64x2 v_src11 = v_cvt_f64_high(v_int1);
                v_float64x2 v_dst00 = v_load(dst + x);
                v_float64x2 v_dst01 = v_load(dst + x + 2);
                v_float64x2 v_dst10 = v_load(dst + x + 4);
                v_float64x2 v_dst11 = v_load(dst + x + 6);
                v_dst00 = (v_dst00 * v_beta) + (v_src00 * v_alpha);
                v_dst01 = (v_dst01 * v_beta) + (v_src01 * v_alpha);
                v_dst10 = (v_dst10 * v_beta) + (v_src10 * v_alpha);
                v_dst11 = (v_dst11 * v_beta) + (v_src11 * v_alpha);
                v_store(dst + x, v_dst00);
                v_store(dst + x + 2, v_dst01);
                v_store(dst + x + 4, v_dst10);
                v_store(dst + x + 6, v_dst11);
            }
        }
        return x;
    }
};
#endif //CV_SIMD128_64F
#endif //CV_SIMD128
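A recurring idiom in the masked (cn == 1) branches above: the 0/nonzero byte mask is converted to an all-zeros/all-ones lane mask and ANDed into the source, so masked-out lanes contribute zero, which is the identity for accumulation. In isolation:

    v_uint8x16 v_0    = v_setzero_u8();
    v_uint8x16 v_mask = v_load(mask + x);           // 0 or nonzero per pixel
    v_mask = ~(v_0 == v_mask);                      // 0x00 -> 0x00, nonzero -> 0xFF
    v_uint8x16 v_src  = v_load(src + x) & v_mask;   // masked lanes become zero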
template<typename T, typename AT> void
acc_( const T* src, AT* dst, const uchar* mask, int len, int cn )
{
    int i = Acc_SIMD<T, AT>()(src, dst, mask, len, cn);

    if( !mask )
    {
        len *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= len - 4; i += 4 )
        {
            AT t0, t1;
            t0 = src[i] + dst[i];
            t1 = src[i+1] + dst[i+1];
            dst[i] = t0; dst[i+1] = t1;

            t0 = src[i+2] + dst[i+2];
            t1 = src[i+3] + dst[i+3];
            dst[i+2] = t0; dst[i+3] = t1;
        }
        #endif
        for( ; i < len; i++ )
            dst[i] += src[i];
    }
    else if( cn == 1 )
    {
        for( ; i < len; i++ )
        {
            if( mask[i] )
                dst[i] += src[i];
        }
    }
    else if( cn == 3 )
    {
        for( ; i < len; i++, src += 3, dst += 3 )
        {
            if( mask[i] )
            {
                AT t0 = src[0] + dst[0];
                AT t1 = src[1] + dst[1];
                AT t2 = src[2] + dst[2];
                dst[0] = t0; dst[1] = t1; dst[2] = t2;
            }
        }
    }
    else
    {
        for( ; i < len; i++, src += cn, dst += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                    dst[k] += src[k];
            }
    }
}
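A quick usage sketch of acc_ (hypothetical buffers), exercising the no-mask path:

    float src[3] = { 1.f, 2.f, 3.f };
    float dst[3] = { 10.f, 10.f, 10.f };
    acc_<float, float>(src, dst, /*mask=*/NULL, /*len=*/3, /*cn=*/1);
    // dst is now { 11.f, 12.f, 13.f }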
template<typename T, typename AT> void
accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn )
{
    int i = AccSqr_SIMD<T, AT>()(src, dst, mask, len, cn);

    if( !mask )
    {
        len *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= len - 4; i += 4 )
        {
            AT t0, t1;
            t0 = (AT)src[i]*src[i] + dst[i];
            t1 = (AT)src[i+1]*src[i+1] + dst[i+1];
            dst[i] = t0; dst[i+1] = t1;

            t0 = (AT)src[i+2]*src[i+2] + dst[i+2];
            t1 = (AT)src[i+3]*src[i+3] + dst[i+3];
            dst[i+2] = t0; dst[i+3] = t1;
        }
        #endif
        for( ; i < len; i++ )
            dst[i] += (AT)src[i]*src[i];
    }
    else if( cn == 1 )
    {
        for( ; i < len; i++ )
        {
            if( mask[i] )
                dst[i] += (AT)src[i]*src[i];
        }
    }
    else if( cn == 3 )
    {
        for( ; i < len; i++, src += 3, dst += 3 )
        {
            if( mask[i] )
            {
                AT t0 = (AT)src[0]*src[0] + dst[0];
                AT t1 = (AT)src[1]*src[1] + dst[1];
                AT t2 = (AT)src[2]*src[2] + dst[2];
                dst[0] = t0; dst[1] = t1; dst[2] = t2;
            }
        }
    }
    else
    {
        for( ; i < len; i++, src += cn, dst += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                    dst[k] += (AT)src[k]*src[k];
            }
    }
}
template<typename T, typename AT> void
accProd_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn )
{
    int i = AccProd_SIMD<T, AT>()(src1, src2, dst, mask, len, cn);

    if( !mask )
    {
        len *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= len - 4; i += 4 )
        {
            AT t0, t1;
            t0 = (AT)src1[i]*src2[i] + dst[i];
            t1 = (AT)src1[i+1]*src2[i+1] + dst[i+1];
            dst[i] = t0; dst[i+1] = t1;

            t0 = (AT)src1[i+2]*src2[i+2] + dst[i+2];
            t1 = (AT)src1[i+3]*src2[i+3] + dst[i+3];
            dst[i+2] = t0; dst[i+3] = t1;
        }
        #endif
        for( ; i < len; i++ )
            dst[i] += (AT)src1[i]*src2[i];
    }
    else if( cn == 1 )
    {
        for( ; i < len; i++ )
        {
            if( mask[i] )
                dst[i] += (AT)src1[i]*src2[i];
        }
    }
    else if( cn == 3 )
    {
        for( ; i < len; i++, src1 += 3, src2 += 3, dst += 3 )
        {
            if( mask[i] )
            {
                AT t0 = (AT)src1[0]*src2[0] + dst[0];
                AT t1 = (AT)src1[1]*src2[1] + dst[1];
                AT t2 = (AT)src1[2]*src2[2] + dst[2];
                dst[0] = t0; dst[1] = t1; dst[2] = t2;
            }
        }
    }
    else
    {
        for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                    dst[k] += (AT)src1[k]*src2[k];
            }
    }
}
template<typename T, typename AT> void
accW_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha )
{
    AT a = (AT)alpha, b = 1 - a;
    int i = AccW_SIMD<T, AT>()(src, dst, mask, len, cn, a);

    if( !mask )
    {
        len *= cn;
        #if CV_ENABLE_UNROLLED
        for( ; i <= len - 4; i += 4 )
        {
            AT t0, t1;
            t0 = src[i]*a + dst[i]*b;
            t1 = src[i+1]*a + dst[i+1]*b;
            dst[i] = t0; dst[i+1] = t1;

            t0 = src[i+2]*a + dst[i+2]*b;
            t1 = src[i+3]*a + dst[i+3]*b;
            dst[i+2] = t0; dst[i+3] = t1;
        }
        #endif
        for( ; i < len; i++ )
            dst[i] = src[i]*a + dst[i]*b;
    }
    else if( cn == 1 )
    {
        for( ; i < len; i++ )
        {
            if( mask[i] )
                dst[i] = src[i]*a + dst[i]*b;
        }
    }
    else if( cn == 3 )
    {
        for( ; i < len; i++, src += 3, dst += 3 )
        {
            if( mask[i] )
            {
                AT t0 = src[0]*a + dst[0]*b;
                AT t1 = src[1]*a + dst[1]*b;
                AT t2 = src[2]*a + dst[2]*b;
                dst[0] = t0; dst[1] = t1; dst[2] = t2;
            }
        }
    }
    else
    {
        for( ; i < len; i++, src += cn, dst += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                    dst[k] = src[k]*a + dst[k]*b;
            }
    }
}
#define DEF_ACC_FUNCS(suffix, type, acctype) \
static void acc_##suffix(const type* src, acctype* dst, \
const uchar* mask, int len, int cn) \
{ acc_(src, dst, mask, len, cn); } \
\
static void accSqr_##suffix(const type* src, acctype* dst, \
const uchar* mask, int len, int cn) \
{ accSqr_(src, dst, mask, len, cn); } \
\
static void accProd_##suffix(const type* src1, const type* src2, \
acctype* dst, const uchar* mask, int len, int cn) \
{ accProd_(src1, src2, dst, mask, len, cn); } \
\
static void accW_##suffix(const type* src, acctype* dst, \
const uchar* mask, int len, int cn, double alpha) \
{ accW_(src, dst, mask, len, cn, alpha); }
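Each DEF_ACC_FUNCS invocation below stamps out four concrete wrappers; for example, DEF_ACC_FUNCS(8u32f, uchar, float) expands in part to:

    static void acc_8u32f(const uchar* src, float* dst,
                          const uchar* mask, int len, int cn)
    { acc_(src, dst, mask, len, cn); }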
DEF_ACC_FUNCS(8u32f, uchar, float)
DEF_ACC_FUNCS(8u64f, uchar, double)
DEF_ACC_FUNCS(16u32f, ushort, float)
DEF_ACC_FUNCS(16u64f, ushort, double)
DEF_ACC_FUNCS(32f, float, float)
DEF_ACC_FUNCS(32f64f, float, double)
DEF_ACC_FUNCS(64f, double, double)

typedef void (*AccFunc)(const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccProdFunc)(const uchar*, const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccWFunc)(const uchar*, uchar*, const uchar*, int, int, double);

static AccFunc accTab[] =
{
...
modules/imgproc/src/accum.dispatch.cpp
0 → 100644
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp"
#include "accum.simd.hpp"
#include "accum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv
{

DEF_ACC_INT_FUNCS(8u32f, uchar, float)
DEF_ACC_INT_FUNCS(8u64f, uchar, double)
DEF_ACC_INT_FUNCS(16u32f, ushort, float)
DEF_ACC_INT_FUNCS(16u64f, ushort, double)
DEF_ACC_FLT_FUNCS(32f, float, float)
DEF_ACC_FLT_FUNCS(32f64f, float, double)
DEF_ACC_FLT_FUNCS(64f, double, double)

} //cv::hal
\ No newline at end of file
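DEF_ACC_INT_FUNCS and DEF_ACC_FLT_FUNCS themselves live in accum.simd.hpp, which is too large to display below; based on the CV_CPU_DISPATCH convention that the generated declarations header enables, each line presumably expands to dispatch stubs along these lines (a sketch, not the verbatim macro):

    // Hypothetical expansion of DEF_ACC_INT_FUNCS(8u32f, uchar, float):
    static void acc_8u32f(const uchar* src, float* dst, const uchar* mask,
                          int len, int cn)
    {
        CV_CPU_DISPATCH(acc_8u32f, (src, dst, mask, len, cn),
                        CV_CPU_DISPATCH_MODES_ALL);
    }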
modules/imgproc/src/accum.simd.hpp
0 → 100644
This source diff could not be displayed because it is too large. You can view the blob instead.