opencv · Commit 28db4a22, authored Sep 02, 2016 by Maksim Shabunin

Merge pull request #7175 from tomoaki0705:featureIntrinsic64

Parents: 1ac1e5be, 7fef96be

Showing 4 changed files with 626 additions and 332 deletions:

- modules/core/include/opencv2/core/hal/intrin_neon.hpp (+280, -2)
- modules/core/include/opencv2/core/hal/intrin_sse.hpp (+10, -0)
- modules/core/test/test_intrin.cpp (+6, -0)
- modules/imgproc/src/accum.cpp (+330, -330)
modules/core/include/opencv2/core/hal/intrin_neon.hpp

@@ -53,6 +53,28 @@ namespace cv
 //! @cond IGNORED

 #define CV_SIMD128 1
+#if defined(__aarch64__)
+#define CV_SIMD128_64F 1
+#else
+#define CV_SIMD128_64F 0
+#endif
+
+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
+template <typename T> static inline \
+_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
+template <typename T> static inline \
+float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
+OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
+OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
+OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
+OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
+OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
+OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
+OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
+OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
+OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
+#endif

 struct v_uint8x16
 {

@@ -232,6 +254,27 @@ struct v_int64x2
     int64x2_t val;
 };

+#if CV_SIMD128_64F
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    v_float64x2() {}
+    explicit v_float64x2(float64x2_t v) : val(v) {}
+    v_float64x2(double v0, double v1)
+    {
+        double v[] = {v0, v1};
+        val = vld1q_f64(v);
+    }
+    double get0() const
+    {
+        return vgetq_lane_f64(val, 0);
+    }
+    float64x2_t val;
+};
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
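The v_float64x2 struct added above gives AArch64 builds a two-lane double vector with the same interface as the other v_ types. As a quick orientation (a sketch, not part of the patch; it only assumes an OpenCV build where CV_SIMD128_64F is 1), user code can drive it through the generic v_load/v_store wrappers:

#include <opencv2/core/hal/intrin.hpp>

// Adds two double buffers two lanes at a time, falling back to scalar code
// when 64-bit float vectors are unavailable.
void add_f64(const double* a, const double* b, double* dst, int n)
{
    int x = 0;
#if CV_SIMD128_64F
    for (; x <= n - 2; x += 2)                       // v_float64x2::nlanes == 2
        cv::v_store(dst + x, cv::v_load(a + x) + cv::v_load(b + x));
#endif
    for (; x < n; ++x)                               // scalar tail
        dst[x] = a[x] + b[x];
}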
@@ -255,6 +298,21 @@ OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
 OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
 OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
 OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
+OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
+OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
+OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
+#endif

 #define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \

@@ -337,7 +395,13 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
+#else
 inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
 {
     float32x4_t reciprocal = vrecpeq_f32(b.val);

@@ -353,6 +417,7 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
     a.val = vmulq_f32(a.val, reciprocal);
     return a;
 }
+#endif

 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                          v_int32x4& c, v_int32x4& d)

@@ -421,6 +486,18 @@ inline v_float32x4 operator ~ (const v_float32x4& a)
     return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
 }

+#if CV_SIMD128_64F
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    return v_float32x4(vsqrtq_f32(x.val));
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    v_float32x4 one = v_setall_f32(1.0f);
+    return one / v_sqrt(x);
+}
+#else
 inline v_float32x4 v_sqrt(const v_float32x4& x)
 {
     float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));

@@ -437,10 +514,47 @@ inline v_float32x4 v_invsqrt(const v_float32x4& x)
     e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
     return v_float32x4(e);
 }
+#endif

 inline v_float32x4 v_abs(v_float32x4 x)
 { return v_float32x4(vabsq_f32(x.val)); }

+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
+
+inline v_float64x2 operator ~ (const v_float64x2& a)
+{
+    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{
+    return v_float64x2(vsqrtq_f64(x.val));
+}
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    v_float64x2 one = v_setall_f64(1.0f);
+    return one / v_sqrt(x);
+}
+
+inline v_float64x2 v_abs(v_float64x2 x)
+{ return v_float64x2(vabsq_f64(x.val)); }
+#endif
+
 // TODO: exp, log, sin, cos

 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \

@@ -463,8 +577,23 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
+#endif

+#if CV_SIMD128_64F
+inline int64x2_t vmvnq_s64(int64x2_t a)
+{
+    int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF));
+    return veorq_s64(a, vx);
+}
+inline uint64x2_t vmvnq_u64(uint64x2_t a)
+{
+    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
+    return veorq_u64(a, vx);
+}
+#endif
 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \

@@ -486,6 +615,11 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
+#endif

 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)

@@ -501,6 +635,9 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
+#endif

 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \

@@ -528,6 +665,24 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_
     return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
 }

+#if CV_SIMD128_64F
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
+    return v_sqrt(x);
+}
+
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
+}
+
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val)));
+}
+#endif
+
 // trade efficiency for convenience
 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
 inline _Tpvec operator << (const _Tpvec& a, int n) \

@@ -575,6 +730,9 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
+#endif

 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \

@@ -627,6 +785,16 @@ inline int v_signmask(const v_int32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
 inline int v_signmask(const v_float32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
+#if CV_SIMD128_64F
+inline int v_signmask(const v_uint64x2& a)
+{
+    int64x1_t m0 = vdup_n_s64(0);
+    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
+    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
+}
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#endif

 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
 inline bool v_check_all(const v_##_Tpvec& a) \

@@ -645,6 +813,9 @@ inline bool v_check_any(const v_##_Tpvec& a) \
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
+#endif

 inline bool v_check_all(const v_int8x16& a)
 { return v_check_all(v_reinterpret_as_u8(a)); }

@@ -664,6 +835,17 @@ inline bool v_check_any(const v_int32x4& a)
 inline bool v_check_any(const v_float32x4& a)
 { return v_check_any(v_reinterpret_as_u32(a)); }

+#if CV_SIMD128_64F
+inline bool v_check_all(const v_int64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_int64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
 { \

@@ -677,6 +859,9 @@ OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
+#endif

 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \

@@ -710,6 +895,27 @@ inline v_int32x4 v_load_expand_q(const schar* ptr)
     return v_int32x4(vmovl_s16(v1));
 }

+#if defined(__aarch64__)
+#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    b0.val = vzip1q_##suffix(a0.val, a1.val); \
+    b1.val = vzip2q_##suffix(a0.val, a1.val); \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
+    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
+}
+#else
 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
 { \

@@ -730,6 +936,7 @@ inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c,
     c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
     d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
 }
+#endif

 OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
 OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)

@@ -738,6 +945,9 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
 OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
+#endif

 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
 template <int s> \

@@ -755,6 +965,9 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
 OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
 OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
 OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
+#endif

 inline v_int32x4 v_round(const v_float32x4& a)
 {

@@ -782,6 +995,38 @@ inline v_int32x4 v_ceil(const v_float32x4& a)
 inline v_int32x4 v_trunc(const v_float32x4& a)
 { return v_int32x4(vcvtq_s32_f32(a.val)); }

+#if CV_SIMD128_64F
+inline v_int32x4 v_round(const v_float64x2& a)
+{
+    static const int32x2_t zero = vdup_n_s32(0);
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    static const int32x2_t zero = vdup_n_s32(0);
+    int64x2_t a1 = vcvtq_s64_f64(a.val);
+    uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
+    a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
+    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    static const int32x2_t zero = vdup_n_s32(0);
+    int64x2_t a1 = vcvtq_s64_f64(a.val);
+    uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
+    a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
+    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{
+    static const int32x2_t zero = vdup_n_s32(0);
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
+}
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                            const v_##_Tpvec& a2, const v_##_Tpvec& a3, \

@@ -864,12 +1109,45 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
+#endif

 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 {
     return v_float32x4(vcvtq_f32_s32(a.val));
 }

+#if CV_SIMD128_64F
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    float32x2_t zero = vdup_n_f32(0.0f);
+    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
+}
+#endif
+
 //! @endcond

 }
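The conversion block at the end of this file is the NEON side of the new v_cvt_f64/v_cvt_f64_high pair: the low two float lanes widen through v_cvt_f64, the high two through v_cvt_f64_high. A minimal round-trip sketch (not from the commit; assumes an OpenCV build with CV_SIMD128_64F enabled):

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD128_64F
    float buf[4] = { 1.5f, 2.5f, 3.5f, 4.5f };
    cv::v_float32x4 f  = cv::v_load(buf);
    cv::v_float64x2 lo = cv::v_cvt_f64(f);       // lanes 0,1 widened to double
    cv::v_float64x2 hi = cv::v_cvt_f64_high(f);  // lanes 2,3 widened to double
    double d[4];
    cv::v_store(d, lo);
    cv::v_store(d + 2, hi);
    std::printf("%g %g %g %g\n", d[0], d[1], d[2], d[3]);  // 1.5 2.5 3.5 4.5
#endif
    return 0;
}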
modules/core/include/opencv2/core/hal/intrin_sse.hpp

@@ -1611,11 +1611,21 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
     return v_float64x2(_mm_cvtepi32_pd(a.val));
 }

+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val, 8)));
+}
+
 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 {
     return v_float64x2(_mm_cvtps_pd(a.val));
 }

+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val), 8))));
+}
+
 //! @endcond

 }
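On the SSE side, v_cvt_f64_high has to bring the upper half of the 128-bit register down before widening, which is what the _mm_srli_si128 byte shift does. A rough equivalent written directly against SSE2 (a sketch for illustration, not code from the patch):

#include <emmintrin.h>

// Widens float lanes 2,3 of v to double, mirroring v_cvt_f64_high(v_float32x4).
static inline __m128d cvt_f64_high_sse2(__m128 v)
{
    __m128i bits = _mm_castps_si128(v);           // reinterpret as raw 128-bit lanes
    bits = _mm_srli_si128(bits, 8);               // shift right by 8 bytes: high half -> low half
    return _mm_cvtps_pd(_mm_castsi128_ps(bits));  // convert the two low floats to double
}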
modules/core/test/test_intrin.cpp

@@ -652,12 +652,18 @@ template<typename R> struct TheTest
         dataA *= 1.1;
         R a = dataA;
         Rt b = v_cvt_f64(a);
+        Rt c = v_cvt_f64_high(a);
         Data<Rt> resB = b;
+        Data<Rt> resC = c;
         int n = std::min<int>(Rt::nlanes, R::nlanes);
         for (int i = 0; i < n; ++i)
         {
             EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
         }
+        for (int i = 0; i < n; ++i)
+        {
+            EXPECT_EQ((typename Rt::lane_type)dataA[i + n], resC[i]);
+        }
 #endif
         return *this;
     }
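The added test lines feed the upper input lanes through v_cvt_f64_high and compare against the plain scalar cast. Outside the TheTest harness, the same check boils down to something like this (a standalone sketch with arbitrary values, assuming CV_SIMD128_64F):

#include <opencv2/core/hal/intrin.hpp>
#include <cassert>

void check_cvt_f64_high()
{
#if CV_SIMD128_64F
    float src[4] = { 0.5f, 1.5f, 2.5f, 3.5f };
    double res[2];
    cv::v_store(res, cv::v_cvt_f64_high(cv::v_load(src)));
    assert(res[0] == (double)src[2]);   // upper lanes land in the result
    assert(res[1] == (double)src[3]);
#endif
}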
modules/imgproc/src/accum.cpp

@@ -395,7 +395,7 @@ struct AccW_SIMD<double, double>
         return x;
     }
 };

-#elif CV_SSE2
+#elif CV_SIMD128
 template <>
 struct Acc_SIMD<float, float>
 {

@@ -408,8 +408,8 @@ struct Acc_SIMD<float, float>
         len *= cn;
         for ( ; x <= len - 8; x += 8)
         {
-            _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_loadu_ps(src + x)));
-            _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_loadu_ps(src + x + 4)));
+            v_store(dst + x, v_load(dst + x) + v_load(src + x));
+            v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src + x + 4));
         }
     }
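The rewritten loop above is the plain accumulate kernel, dst[i] += src[i], expressed with the portable v_load/v_store wrappers instead of raw _mm_* calls. Stripped of the Acc_SIMD machinery, it looks roughly like this (a free-standing sketch, not the OpenCV function itself):

#include <opencv2/core/hal/intrin.hpp>

void acc_f32(const float* src, float* dst, int len)
{
    int x = 0;
#if CV_SIMD128
    for (; x <= len - 8; x += 8)      // two 4-lane vectors per iteration
    {
        cv::v_store(dst + x,     cv::v_load(dst + x)     + cv::v_load(src + x));
        cv::v_store(dst + x + 4, cv::v_load(dst + x + 4) + cv::v_load(src + x + 4));
    }
#endif
    for (; x < len; ++x)              // scalar tail
        dst[x] += src[x];
}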
@@ -417,6 +417,7 @@ struct Acc_SIMD<float, float>
     }
 };

+#if CV_SIMD128_64F
 template <>
 struct Acc_SIMD<float, double>
 {

@@ -429,17 +430,12 @@ struct Acc_SIMD<float, double>
         len *= cn;
         for ( ; x <= len - 4; x += 4)
         {
-            __m128 v_src = _mm_loadu_ps(src + x);
-            __m128d v_src0 = _mm_cvtps_pd(v_src);
-            __m128d v_src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_src, v_src, _MM_SHUFFLE(1, 0, 3, 2)));
-            __m128d v_dst0 = _mm_loadu_pd(dst + x);
-            __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-            v_dst0 = _mm_add_pd(v_dst0, v_src0);
-            v_dst1 = _mm_add_pd(v_dst1, v_src1);
-            _mm_storeu_pd(dst + x, v_dst0);
-            _mm_storeu_pd(dst + x + 2, v_dst1);
+            v_float32x4 v_src = v_load(src + x);
+            v_float64x2 v_src0 = v_cvt_f64(v_src);
+            v_float64x2 v_src1 = v_cvt_f64_high(v_src);
+            v_store(dst + x, v_load(dst + x) + v_src0);
+            v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
         }
     }
     return x;

@@ -458,21 +454,17 @@ struct Acc_SIMD<double, double>
         len *= cn;
         for ( ; x <= len - 4; x += 4)
         {
-            __m128d v_src0 = _mm_loadu_pd(src + x);
-            __m128d v_src1 = _mm_loadu_pd(src + x + 2);
-            __m128d v_dst0 = _mm_loadu_pd(dst + x);
-            __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-            v_dst0 = _mm_add_pd(v_dst0, v_src0);
-            v_dst1 = _mm_add_pd(v_dst1, v_src1);
-            _mm_storeu_pd(dst + x, v_dst0);
-            _mm_storeu_pd(dst + x + 2, v_dst1);
+            v_float64x2 v_src0 = v_load(src + x);
+            v_float64x2 v_src1 = v_load(src + x + 2);
+            v_store(dst + x, v_load(dst + x) + v_src0);
+            v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
         }
     }
     return x;
     }
 };
+#endif //CV_SIMD128_64F

 template <>
 struct AccSqr_SIMD<float, float>

@@ -486,12 +478,13 @@ struct AccSqr_SIMD<float, float>
         len *= cn;
         for ( ; x <= len - 8; x += 8)
         {
-            __m128 v_src0 = _mm_loadu_ps(src + x);
-            __m128 v_src1 = _mm_loadu_ps(src + x + 4);
-            v_src0 = _mm_mul_ps(v_src0, v_src0);
-            v_src1 = _mm_mul_ps(v_src1, v_src1);
-            _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
-            _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
+            v_float32x4 v_src0 = v_load(src + x);
+            v_float32x4 v_src1 = v_load(src + x + 4);
+            v_src0 = v_src0 * v_src0;
+            v_src1 = v_src1 * v_src1;
+            v_store(dst + x, v_load(dst + x) + v_src0);
+            v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
         }
     }

@@ -499,6 +492,7 @@ struct AccSqr_SIMD<float, float>
     }
 };

+#if CV_SIMD128_64F
 template <>
 struct AccSqr_SIMD<float, double>
 {

@@ -511,19 +505,14 @@ struct AccSqr_SIMD<float, double>
         len *= cn;
         for ( ; x <= len - 4; x += 4)
         {
-            __m128 v_src = _mm_loadu_ps(src + x);
-            __m128d v_src0 = _mm_cvtps_pd(v_src);
-            __m128d v_src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_src, v_src, _MM_SHUFFLE(1, 0, 3, 2)));
-            v_src0 = _mm_mul_pd(v_src0, v_src0);
-            v_src1 = _mm_mul_pd(v_src1, v_src1);
-            __m128d v_dst0 = _mm_loadu_pd(dst + x);
-            __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-            v_dst0 = _mm_add_pd(v_dst0, v_src0);
-            v_dst1 = _mm_add_pd(v_dst1, v_src1);
-            _mm_storeu_pd(dst + x, v_dst0);
-            _mm_storeu_pd(dst + x + 2, v_dst1);
+            v_float32x4 v_src = v_load(src + x);
+            v_float64x2 v_src0 = v_cvt_f64(v_src);
+            v_float64x2 v_src1 = v_cvt_f64_high(v_src);
+            v_src0 = v_src0 * v_src0;
+            v_src1 = v_src1 * v_src1;
+            v_store(dst + x, v_load(dst + x) + v_src0);
+            v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
         }
     }
     return x;

@@ -542,23 +531,19 @@ struct AccSqr_SIMD<double, double>
         len *= cn;
         for ( ; x <= len - 4; x += 4)
        {
-            __m128d v_src0 = _mm_loadu_pd(src + x);
-            __m128d v_src1 = _mm_loadu_pd(src + x + 2);
-            v_src0 = _mm_mul_pd(v_src0, v_src0);
-            v_src1 = _mm_mul_pd(v_src1, v_src1);
-            __m128d v_dst0 = _mm_loadu_pd(dst + x);
-            __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-            v_dst0 = _mm_add_pd(v_dst0, v_src0);
-            v_dst1 = _mm_add_pd(v_dst1, v_src1);
-            _mm_storeu_pd(dst + x, v_dst0);
-            _mm_storeu_pd(dst + x + 2, v_dst1);
+            v_float64x2 v_src0 = v_load(src + x);
+            v_float64x2 v_src1 = v_load(src + x + 2);
+            v_src0 = v_src0 * v_src0;
+            v_src1 = v_src1 * v_src1;
+            v_store(dst + x, v_load(dst + x) + v_src0);
+            v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
         }
     }
     return x;
     }
 };
+#endif //CV_SIMD128_64F

 template <>
 struct AccProd_SIMD<float, float>

@@ -572,8 +557,8 @@ struct AccProd_SIMD<float, float>
         len *= cn;
         for ( ; x <= len - 8; x += 8)
         {
-            _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_mul_ps(_mm_loadu_ps(src1 + x), _mm_loadu_ps(src2 + x))));
-            _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_mul_ps(_mm_loadu_ps(src1 + x + 4), _mm_loadu_ps(src2 + x + 4))));
+            v_store(dst + x, v_load(dst + x) + v_load(src1 + x) * v_load(src2 + x));
+            v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src1 + x + 4) * v_load(src2 + x + 4));
         }
     }

@@ -581,6 +566,7 @@ struct AccProd_SIMD<float, float>
     }
 };

+#if CV_SIMD128_64F
 template <>
 struct AccProd_SIMD<float, double>
 {

@@ -593,22 +579,16 @@ struct AccProd_SIMD<float, double>
         len *= cn;
         for ( ; x <= len - 4; x += 4)
         {
-            __m128 v_1src = _mm_loadu_ps(src1 + x);
-            __m128 v_2src = _mm_loadu_ps(src2 + x);
-            __m128d v_1src0 = _mm_cvtps_pd(v_1src);
-            __m128d v_1src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_1src, v_1src, _MM_SHUFFLE(0, 0, 3, 2)));
-            __m128d v_2src0 = _mm_cvtps_pd(v_2src);
-            __m128d v_2src1 = _mm_cvtps_pd(_mm_shuffle_ps(v_2src, v_2src, _MM_SHUFFLE(0, 0, 3, 2)));
-            __m128d v_dst0 = _mm_loadu_pd(dst + x);
-            __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-            v_dst0 = _mm_add_pd(v_dst0, _mm_mul_pd(v_1src0, v_2src0));
-            v_dst1 = _mm_add_pd(v_dst1, _mm_mul_pd(v_1src1, v_2src1));
-            _mm_storeu_pd(dst + x, v_dst0);
-            _mm_storeu_pd(dst + x + 2, v_dst1);
+            v_float32x4 v_1src = v_load(src1 + x);
+            v_float32x4 v_2src = v_load(src2 + x);
+            v_float64x2 v_1src0 = v_cvt_f64(v_1src);
+            v_float64x2 v_1src1 = v_cvt_f64_high(v_1src);
+            v_float64x2 v_2src0 = v_cvt_f64(v_2src);
+            v_float64x2 v_2src1 = v_cvt_f64_high(v_2src);
+            v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0));
+            v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1));
         }
     }
     return x;

@@ -627,25 +607,19 @@ struct AccProd_SIMD<double, double>
         len *= cn;
         for ( ; x <= len - 4; x += 4)
         {
-            __m128d v_src00 = _mm_loadu_pd(src1 + x);
-            __m128d v_src01 = _mm_loadu_pd(src1 + x + 2);
-            __m128d v_src10 = _mm_loadu_pd(src2 + x);
-            __m128d v_src11 = _mm_loadu_pd(src2 + x + 2);
-            __m128d v_src0 = _mm_mul_pd(v_src00, v_src10);
-            __m128d v_src1 = _mm_mul_pd(v_src01, v_src11);
-            __m128d v_dst0 = _mm_loadu_pd(dst + x);
-            __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-            v_dst0 = _mm_add_pd(v_dst0, v_src0);
-            v_dst1 = _mm_add_pd(v_dst1, v_src1);
-            _mm_storeu_pd(dst + x, v_dst0);
-            _mm_storeu_pd(dst + x + 2, v_dst1);
+            v_float64x2 v_src00 = v_load(src1 + x);
+            v_float64x2 v_src01 = v_load(src1 + x + 2);
+            v_float64x2 v_src10 = v_load(src2 + x);
+            v_float64x2 v_src11 = v_load(src2 + x + 2);
+            v_store(dst + x, v_load(dst + x) + (v_src00 * v_src10));
+            v_store(dst + x + 2, v_load(dst + x + 2) + (v_src01 * v_src11));
         }
     }
     return x;
     }
 };
+#endif //CV_SIMD128_64F

 template <>
 struct AccW_SIMD<float, float>

@@ -653,16 +627,16 @@ struct AccW_SIMD<float, float>
     int operator() (const float* src, float* dst, const uchar* mask, int len, int cn, float alpha) const
     {
         int x = 0;
-        __m128 v_alpha = _mm_set1_ps(alpha);
-        __m128 v_beta = _mm_set1_ps(1.0f - alpha);
+        v_float32x4 v_alpha = v_setall_f32(alpha);
+        v_float32x4 v_beta = v_setall_f32(1.0f - alpha);

         if (!mask)
         {
             len *= cn;
             for ( ; x <= len - 8; x += 8)
             {
-                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x), v_beta), _mm_mul_ps(_mm_loadu_ps(src + x), v_alpha)));
-                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(dst + x + 4), v_beta), _mm_mul_ps(_mm_loadu_ps(src + x + 4), v_alpha)));
+                v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_load(src + x) * v_alpha)));
+                v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_load(src + x + 4) * v_alpha)));
             }
         }
     }
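AccW computes the running weighted average dst = dst*(1 - alpha) + src*alpha, and the change above swaps _mm_set1_ps/_mm_mul_ps/_mm_add_ps for v_setall_f32 and the overloaded vector operators. As a free-standing sketch of the same kernel (not the OpenCV implementation):

#include <opencv2/core/hal/intrin.hpp>

void accw_f32(const float* src, float* dst, int len, float alpha)
{
    int x = 0;
#if CV_SIMD128
    cv::v_float32x4 v_alpha = cv::v_setall_f32(alpha);
    cv::v_float32x4 v_beta  = cv::v_setall_f32(1.0f - alpha);
    for (; x <= len - 4; x += 4)
        cv::v_store(dst + x, cv::v_load(dst + x) * v_beta + cv::v_load(src + x) * v_alpha);
#endif
    for (; x < len; ++x)              // scalar tail
        dst[x] = dst[x] * (1.0f - alpha) + src[x] * alpha;
}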
...
@@ -670,31 +644,32 @@ struct AccW_SIMD<float, float>
...
@@ -670,31 +644,32 @@ struct AccW_SIMD<float, float>
}
}
};
};
#if CV_SIMD128_64F
template
<>
template
<>
struct
AccW_SIMD
<
float
,
double
>
struct
AccW_SIMD
<
float
,
double
>
{
{
int
operator
()
(
const
float
*
src
,
double
*
dst
,
const
uchar
*
mask
,
int
len
,
int
cn
,
double
alpha
)
const
int
operator
()
(
const
float
*
src
,
double
*
dst
,
const
uchar
*
mask
,
int
len
,
int
cn
,
double
alpha
)
const
{
{
int
x
=
0
;
int
x
=
0
;
__m128d
v_alpha
=
_mm_set1_pd
(
alpha
);
v_float64x2
v_alpha
=
v_setall_f64
(
alpha
);
__m128d
v_beta
=
_mm_set1_pd
(
1.0
f
-
alpha
);
v_float64x2
v_beta
=
v_setall_f64
(
1.0
f
-
alpha
);
if
(
!
mask
)
if
(
!
mask
)
{
{
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
8
;
x
+=
8
)
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
{
__m128
v_src0
=
_mm_loadu_ps
(
src
+
x
);
v_float32x4
v_src0
=
v_load
(
src
+
x
);
__m128
v_src1
=
_mm_loadu_ps
(
src
+
x
+
4
);
v_float32x4
v_src1
=
v_load
(
src
+
x
+
4
);
__m128d
v_src00
=
_mm_cvtps_pd
(
v_src0
);
v_float64x2
v_src00
=
v_cvt_f64
(
v_src0
);
__m128d
v_src01
=
_mm_cvtps_pd
(
_mm_shuffle_ps
(
v_src0
,
v_src0
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
))
);
v_float64x2
v_src01
=
v_cvt_f64_high
(
v_src0
);
__m128d
v_src10
=
_mm_cvtps_pd
(
v_src1
);
v_float64x2
v_src10
=
v_cvt_f64
(
v_src1
);
__m128d
v_src11
=
_mm_cvtps_pd
(
_mm_shuffle_ps
(
v_src1
,
v_src1
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
))
);
v_float64x2
v_src11
=
v_cvt_f64_high
(
v_src1
);
_mm_storeu_pd
(
dst
+
x
,
_mm_add_pd
(
_mm_mul_pd
(
_mm_loadu_pd
(
dst
+
x
),
v_beta
),
_mm_mul_pd
(
v_src00
,
v_alpha
)));
v_store
(
dst
+
x
,
((
v_load
(
dst
+
x
)
*
v_beta
)
+
(
v_src00
*
v_alpha
)));
_mm_storeu_pd
(
dst
+
x
+
2
,
_mm_add_pd
(
_mm_mul_pd
(
_mm_loadu_pd
(
dst
+
x
+
2
),
v_beta
),
_mm_mul_pd
(
v_src01
,
v_alpha
)));
v_store
(
dst
+
x
+
2
,
((
v_load
(
dst
+
x
+
2
)
*
v_beta
)
+
(
v_src01
*
v_alpha
)));
_mm_storeu_pd
(
dst
+
x
+
4
,
_mm_add_pd
(
_mm_mul_pd
(
_mm_loadu_pd
(
dst
+
x
+
4
),
v_beta
),
_mm_mul_pd
(
v_src10
,
v_alpha
)));
v_store
(
dst
+
x
+
4
,
((
v_load
(
dst
+
x
+
4
)
*
v_beta
)
+
(
v_src10
*
v_alpha
)));
_mm_storeu_pd
(
dst
+
x
+
6
,
_mm_add_pd
(
_mm_mul_pd
(
_mm_loadu_pd
(
dst
+
x
+
6
),
v_beta
),
_mm_mul_pd
(
v_src11
,
v_alpha
)));
v_store
(
dst
+
x
+
6
,
((
v_load
(
dst
+
x
+
6
)
*
v_beta
)
+
(
v_src11
*
v_alpha
)));
}
}
}
}
...
@@ -708,26 +683,27 @@ struct AccW_SIMD<double, double>
...
@@ -708,26 +683,27 @@ struct AccW_SIMD<double, double>
int
operator
()
(
const
double
*
src
,
double
*
dst
,
const
uchar
*
mask
,
int
len
,
int
cn
,
double
alpha
)
const
int
operator
()
(
const
double
*
src
,
double
*
dst
,
const
uchar
*
mask
,
int
len
,
int
cn
,
double
alpha
)
const
{
{
int
x
=
0
;
int
x
=
0
;
__m128d
v_alpha
=
_mm_set1_pd
(
alpha
);
v_float64x2
v_alpha
=
v_setall_f64
(
alpha
);
__m128d
v_beta
=
_mm_set1_pd
(
1.0
f
-
alpha
);
v_float64x2
v_beta
=
v_setall_f64
(
1.0
f
-
alpha
);
if
(
!
mask
)
if
(
!
mask
)
{
{
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
4
;
x
+=
4
)
for
(
;
x
<=
len
-
4
;
x
+=
4
)
{
{
__m128d
v_src0
=
_mm_loadu_p
d
(
src
+
x
);
v_float64x2
v_src0
=
v_loa
d
(
src
+
x
);
__m128d
v_src1
=
_mm_loadu_p
d
(
src
+
x
+
2
);
v_float64x2
v_src1
=
v_loa
d
(
src
+
x
+
2
);
_mm_storeu_pd
(
dst
+
x
,
_mm_add_pd
(
_mm_mul_pd
(
_mm_loadu_pd
(
dst
+
x
),
v_beta
),
_mm_mul_pd
(
v_src0
,
v_alpha
)));
v_store
(
dst
+
x
,
((
v_load
(
dst
+
x
)
*
v_beta
)
+
(
v_src0
*
v_alpha
)));
_mm_storeu_pd
(
dst
+
x
+
2
,
_mm_add_pd
(
_mm_mul_pd
(
_mm_loadu_pd
(
dst
+
x
+
2
),
v_beta
),
_mm_mul_pd
(
v_src1
,
v_alpha
)));
v_store
(
dst
+
x
+
2
,
((
v_load
(
dst
+
x
+
2
)
*
v_beta
)
+
(
v_src1
*
v_alpha
)));
}
}
}
}
return
x
;
return
x
;
}
}
};
};
#endif
#endif //CV_SIMD128_64F
#endif //CV_SIMD128
#if CV_SIMD128
#if CV_SIMD128
template
<>
template
<>
...
@@ -742,7 +718,7 @@ struct Acc_SIMD<uchar, float>
...
@@ -742,7 +718,7 @@ struct Acc_SIMD<uchar, float>
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
16
;
x
+=
16
)
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
{
v_uint8x16
v_src
=
v_load
(
(
const
uchar
*
)(
src
+
x
)
);
v_uint8x16
v_src
=
v_load
(
src
+
x
);
v_uint16x8
v_src0
,
v_src1
;
v_uint16x8
v_src0
,
v_src1
;
v_expand
(
v_src
,
v_src0
,
v_src1
);
v_expand
(
v_src
,
v_src0
,
v_src1
);
...
@@ -762,9 +738,9 @@ struct Acc_SIMD<uchar, float>
...
@@ -762,9 +738,9 @@ struct Acc_SIMD<uchar, float>
for
(
;
x
<=
len
-
16
;
x
+=
16
)
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
{
v_uint8x16
v_mask
=
v_load
(
(
const
uchar
*
)(
mask
+
x
)
);
v_uint8x16
v_mask
=
v_load
(
mask
+
x
);
v_mask
=
~
(
v_0
==
v_mask
);
v_mask
=
~
(
v_0
==
v_mask
);
v_uint8x16
v_src
=
v_load
(
(
const
uchar
*
)(
src
+
x
)
);
v_uint8x16
v_src
=
v_load
(
src
+
x
);
v_src
=
v_src
&
v_mask
;
v_src
=
v_src
&
v_mask
;
v_uint16x8
v_src0
,
v_src1
;
v_uint16x8
v_src0
,
v_src1
;
v_expand
(
v_src
,
v_src0
,
v_src1
);
v_expand
(
v_src
,
v_src0
,
v_src1
);
...
@@ -795,7 +771,7 @@ struct Acc_SIMD<ushort, float>
...
@@ -795,7 +771,7 @@ struct Acc_SIMD<ushort, float>
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
8
;
x
+=
8
)
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
{
v_uint16x8
v_src
=
v_load
(
(
const
ushort
*
)(
src
+
x
)
);
v_uint16x8
v_src
=
v_load
(
src
+
x
);
v_uint32x4
v_src0
,
v_src1
;
v_uint32x4
v_src0
,
v_src1
;
v_expand
(
v_src
,
v_src0
,
v_src1
);
v_expand
(
v_src
,
v_src0
,
v_src1
);
...
@@ -808,7 +784,7 @@ struct Acc_SIMD<ushort, float>
...
@@ -808,7 +784,7 @@ struct Acc_SIMD<ushort, float>
}
}
};
};
#if CV_S
SE2
#if CV_S
IMD128_64F
template
<>
template
<>
struct
Acc_SIMD
<
uchar
,
double
>
struct
Acc_SIMD
<
uchar
,
double
>
{
{
...
@@ -818,52 +794,52 @@ struct Acc_SIMD<uchar, double>
...
@@ -818,52 +794,52 @@ struct Acc_SIMD<uchar, double>
if
(
!
mask
)
if
(
!
mask
)
{
{
__m128i
v_0
=
_mm_setzero_si128
();
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
16
;
x
+=
16
)
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
{
__m128i
v_src
=
_mm_loadu_si128
((
const
__m128i
*
)(
src
+
x
));
v_uint8x16
v_src
=
v_load
(
src
+
x
);
__m128i
v_int0
=
_mm_unpacklo_epi8
(
v_src
,
v_0
);
v_uint16x8
v_int0
,
v_int1
;
__m128i
v_int1
=
_mm_unpackhi_epi8
(
v_src
,
v_0
);
v_expand
(
v_src
,
v_int0
,
v_int1
);
__m128i
v_int00
=
_mm_unpacklo_epi16
(
v_int0
,
v_0
);
__m128i
v_int01
=
_mm_unpackhi_epi16
(
v_int0
,
v_0
);
v_uint32x4
v_int00
,
v_int01
,
v_int10
,
v_int11
;
__m128i
v_int10
=
_mm_unpacklo_epi16
(
v_int1
,
v_0
);
v_expand
(
v_int0
,
v_int00
,
v_int01
);
__m128i
v_int11
=
_mm_unpackhi_epi16
(
v_int1
,
v_0
);
v_expand
(
v_int1
,
v_int10
,
v_int11
);
__m128d
v_src0
=
_mm_cvtepi32_pd
(
v_int00
);
__m128d
v_src1
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int00
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_float64x2
v_src0
=
v_cvt_f64
(
v_reinterpret_as_s32
(
v_int00
));
__m128d
v_src2
=
_mm_cvtepi32_pd
(
v_int01
);
v_float64x2
v_src1
=
v_cvt_f64_high
(
v_reinterpret_as_s32
(
v_int00
));
__m128d
v_src3
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int01
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_float64x2
v_src2
=
v_cvt_f64
(
v_reinterpret_as_s32
(
v_int01
));
__m128d
v_src4
=
_mm_cvtepi32_pd
(
v_int10
);
v_float64x2
v_src3
=
v_cvt_f64_high
(
v_reinterpret_as_s32
(
v_int01
));
__m128d
v_src5
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int10
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_float64x2
v_src4
=
v_cvt_f64
(
v_reinterpret_as_s32
(
v_int10
));
__m128d
v_src6
=
_mm_cvtepi32_pd
(
v_int11
);
v_float64x2
v_src5
=
v_cvt_f64_high
(
v_reinterpret_as_s32
(
v_int10
));
__m128d
v_src7
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int11
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_float64x2
v_src6
=
v_cvt_f64
(
v_reinterpret_as_s32
(
v_int11
));
v_float64x2
v_src7
=
v_cvt_f64_high
(
v_reinterpret_as_s32
(
v_int11
));
__m128d
v_dst0
=
_mm_loadu_pd
(
dst
+
x
);
__m128d
v_dst1
=
_mm_loadu_pd
(
dst
+
x
+
2
);
v_float64x2
v_dst0
=
v_load
(
dst
+
x
);
__m128d
v_dst2
=
_mm_loadu_pd
(
dst
+
x
+
4
);
v_float64x2
v_dst1
=
v_load
(
dst
+
x
+
2
);
__m128d
v_dst3
=
_mm_loadu_pd
(
dst
+
x
+
6
);
v_float64x2
v_dst2
=
v_load
(
dst
+
x
+
4
);
__m128d
v_dst4
=
_mm_loadu_pd
(
dst
+
x
+
8
);
v_float64x2
v_dst3
=
v_load
(
dst
+
x
+
6
);
__m128d
v_dst5
=
_mm_loadu_pd
(
dst
+
x
+
10
);
v_float64x2
v_dst4
=
v_load
(
dst
+
x
+
8
);
__m128d
v_dst6
=
_mm_loadu_pd
(
dst
+
x
+
12
);
v_float64x2
v_dst5
=
v_load
(
dst
+
x
+
10
);
__m128d
v_dst7
=
_mm_loadu_pd
(
dst
+
x
+
14
);
v_float64x2
v_dst6
=
v_load
(
dst
+
x
+
12
);
v_float64x2
v_dst7
=
v_load
(
dst
+
x
+
14
);
v_dst0
=
_mm_add_pd
(
v_dst0
,
v_src0
);
v_dst1
=
_mm_add_pd
(
v_dst1
,
v_src1
);
v_dst0
=
v_dst0
+
v_src0
;
v_dst2
=
_mm_add_pd
(
v_dst2
,
v_src2
);
v_dst1
=
v_dst1
+
v_src1
;
v_dst3
=
_mm_add_pd
(
v_dst3
,
v_src3
);
v_dst2
=
v_dst2
+
v_src2
;
v_dst4
=
_mm_add_pd
(
v_dst4
,
v_src4
);
v_dst3
=
v_dst3
+
v_src3
;
v_dst5
=
_mm_add_pd
(
v_dst5
,
v_src5
);
v_dst4
=
v_dst4
+
v_src4
;
v_dst6
=
_mm_add_pd
(
v_dst6
,
v_src6
);
v_dst5
=
v_dst5
+
v_src5
;
v_dst7
=
_mm_add_pd
(
v_dst7
,
v_src7
);
v_dst6
=
v_dst6
+
v_src6
;
v_dst7
=
v_dst7
+
v_src7
;
_mm_storeu_pd
(
dst
+
x
,
v_dst0
);
_mm_storeu_pd
(
dst
+
x
+
2
,
v_dst1
);
v_store
(
dst
+
x
,
v_dst0
);
_mm_storeu_pd
(
dst
+
x
+
4
,
v_dst2
);
v_store
(
dst
+
x
+
2
,
v_dst1
);
_mm_storeu_pd
(
dst
+
x
+
6
,
v_dst3
);
v_store
(
dst
+
x
+
4
,
v_dst2
);
_mm_storeu_pd
(
dst
+
x
+
8
,
v_dst4
);
v_store
(
dst
+
x
+
6
,
v_dst3
);
_mm_storeu_pd
(
dst
+
x
+
10
,
v_dst5
);
v_store
(
dst
+
x
+
8
,
v_dst4
);
_mm_storeu_pd
(
dst
+
x
+
12
,
v_dst6
);
v_store
(
dst
+
x
+
10
,
v_dst5
);
_mm_storeu_pd
(
dst
+
x
+
14
,
v_dst7
);
v_store
(
dst
+
x
+
12
,
v_dst6
);
v_store
(
dst
+
x
+
14
,
v_dst7
);
}
}
}
}
return
x
;
return
x
;
...
@@ -879,32 +855,32 @@ struct Acc_SIMD<ushort, double>
...
@@ -879,32 +855,32 @@ struct Acc_SIMD<ushort, double>
if
(
!
mask
)
if
(
!
mask
)
{
{
__m128i
v_0
=
_mm_setzero_si128
();
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
8
;
x
+=
8
)
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
{
__m128i
v_src
=
_mm_loadu_si128
((
const
__m128i
*
)(
src
+
x
));
v_uint16x8
v_src
=
v_load
(
src
+
x
);
__m128i
v_int0
=
_mm_unpacklo_epi16
(
v_src
,
v_0
);
v_uint32x4
v_int0
,
v_int1
;
__m128i
v_int1
=
_mm_unpackhi_epi16
(
v_src
,
v_0
);
v_expand
(
v_src
,
v_int0
,
v_int1
);
__m128d
v_src0
=
_mm_cvtepi32_pd
(
v_int0
);
__m128d
v_src1
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int0
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_float64x2
v_src0
=
v_cvt_f64
(
v_reinterpret_as_s32
(
v_int0
));
__m128d
v_src2
=
_mm_cvtepi32_pd
(
v_int1
);
v_float64x2
v_src1
=
v_cvt_f64_high
(
v_reinterpret_as_s32
(
v_int0
));
__m128d
v_src3
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int1
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_float64x2
v_src2
=
v_cvt_f64
(
v_reinterpret_as_s32
(
v_int1
));
v_float64x2
v_src3
=
v_cvt_f64_high
(
v_reinterpret_as_s32
(
v_int1
));
__m128d
v_dst0
=
_mm_loadu_p
d
(
dst
+
x
);
v_float64x2
v_dst0
=
v_loa
d
(
dst
+
x
);
__m128d
v_dst1
=
_mm_loadu_p
d
(
dst
+
x
+
2
);
v_float64x2
v_dst1
=
v_loa
d
(
dst
+
x
+
2
);
__m128d
v_dst2
=
_mm_loadu_p
d
(
dst
+
x
+
4
);
v_float64x2
v_dst2
=
v_loa
d
(
dst
+
x
+
4
);
__m128d
v_dst3
=
_mm_loadu_p
d
(
dst
+
x
+
6
);
v_float64x2
v_dst3
=
v_loa
d
(
dst
+
x
+
6
);
v_dst0
=
_mm_add_pd
(
v_dst0
,
v_src0
)
;
v_dst0
=
v_dst0
+
v_src0
;
v_dst1
=
_mm_add_pd
(
v_dst1
,
v_src1
)
;
v_dst1
=
v_dst1
+
v_src1
;
v_dst2
=
_mm_add_pd
(
v_dst2
,
v_src2
)
;
v_dst2
=
v_dst2
+
v_src2
;
v_dst3
=
_mm_add_pd
(
v_dst3
,
v_src3
)
;
v_dst3
=
v_dst3
+
v_src3
;
_mm_storeu_pd
(
dst
+
x
,
v_dst0
);
v_store
(
dst
+
x
,
v_dst0
);
_mm_storeu_pd
(
dst
+
x
+
2
,
v_dst1
);
v_store
(
dst
+
x
+
2
,
v_dst1
);
_mm_storeu_pd
(
dst
+
x
+
4
,
v_dst2
);
v_store
(
dst
+
x
+
4
,
v_dst2
);
_mm_storeu_pd
(
dst
+
x
+
6
,
v_dst3
);
v_store
(
dst
+
x
+
6
,
v_dst3
);
}
}
}
}
return
x
;
return
x
;
...
@@ -924,7 +900,7 @@ struct AccSqr_SIMD<uchar, float>
...
@@ -924,7 +900,7 @@ struct AccSqr_SIMD<uchar, float>
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
16
;
x
+=
16
)
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
{
v_uint8x16
v_src
=
v_load
(
(
const
uchar
*
)(
src
+
x
)
);
v_uint8x16
v_src
=
v_load
(
src
+
x
);
v_uint16x8
v_src0
,
v_src1
;
v_uint16x8
v_src0
,
v_src1
;
v_expand
(
v_src
,
v_src0
,
v_src1
);
v_expand
(
v_src
,
v_src0
,
v_src1
);
v_src0
=
v_src0
*
v_src0
;
v_src0
=
v_src0
*
v_src0
;
...
@@ -945,9 +921,9 @@ struct AccSqr_SIMD<uchar, float>
...
@@ -945,9 +921,9 @@ struct AccSqr_SIMD<uchar, float>
v_uint8x16
v_0
=
v_setall_u8
(
0
);
v_uint8x16
v_0
=
v_setall_u8
(
0
);
for
(
;
x
<=
len
-
16
;
x
+=
16
)
for
(
;
x
<=
len
-
16
;
x
+=
16
)
{
{
v_uint8x16
v_mask
=
v_load
(
(
const
uchar
*
)(
mask
+
x
)
);
v_uint8x16
v_mask
=
v_load
(
mask
+
x
);
v_mask
=
~
(
v_0
==
v_mask
);
v_mask
=
~
(
v_0
==
v_mask
);
v_uint8x16
v_src
=
v_load
(
(
const
uchar
*
)(
src
+
x
)
);
v_uint8x16
v_src
=
v_load
(
src
+
x
);
v_src
=
v_src
&
v_mask
;
v_src
=
v_src
&
v_mask
;
v_uint16x8
v_src0
,
v_src1
;
v_uint16x8
v_src0
,
v_src1
;
v_expand
(
v_src
,
v_src0
,
v_src1
);
v_expand
(
v_src
,
v_src0
,
v_src1
);
...
@@ -981,7 +957,7 @@ struct AccSqr_SIMD<ushort, float>
...
@@ -981,7 +957,7 @@ struct AccSqr_SIMD<ushort, float>
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
8
;
x
+=
8
)
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
{
v_uint16x8
v_src
=
v_load
(
(
const
ushort
*
)(
src
+
x
)
);
v_uint16x8
v_src
=
v_load
(
src
+
x
);
v_uint32x4
v_src0
,
v_src1
;
v_uint32x4
v_src0
,
v_src1
;
v_expand
(
v_src
,
v_src0
,
v_src1
);
v_expand
(
v_src
,
v_src0
,
v_src1
);
...
@@ -1000,7 +976,7 @@ struct AccSqr_SIMD<ushort, float>
...
@@ -1000,7 +976,7 @@ struct AccSqr_SIMD<ushort, float>
}
}
};
};
#if CV_S
SE2
#if CV_S
IMD128_64F
template
<>
template
<>
struct
AccSqr_SIMD
<
uchar
,
double
>
struct
AccSqr_SIMD
<
uchar
,
double
>
{
{
...
@@ -1010,37 +986,39 @@ struct AccSqr_SIMD<uchar, double>
...
@@ -1010,37 +986,39 @@ struct AccSqr_SIMD<uchar, double>
if
(
!
mask
)
if
(
!
mask
)
{
{
__m128i
v_0
=
_mm_setzero_si128
();
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
8
;
x
+=
8
)
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
{
__m128i
v_src
=
_mm_loadl_epi64
((
const
__m128i
*
)(
src
+
x
));
v_uint8x16
v_src
=
v_load
(
src
+
x
);
__m128i
v_int
=
_mm_unpacklo_epi8
(
v_src
,
v_0
);
v_uint16x8
v_int
,
dummy
;
__m128i
v_int0
=
_mm_unpacklo_epi16
(
v_int
,
v_0
);
v_expand
(
v_src
,
v_int
,
dummy
);
__m128i
v_int1
=
_mm_unpackhi_epi16
(
v_int
,
v_0
);
__m128d
v_src0
=
_mm_cvtepi32_pd
(
v_int0
);
v_uint32x4
v_int0
,
v_int1
;
__m128d
v_src1
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int0
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_expand
(
v_int
,
v_int0
,
v_int1
);
__m128d
v_src2
=
_mm_cvtepi32_pd
(
v_int1
);
__m128d
v_src3
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int1
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_float64x2
v_src0
=
v_cvt_f64
(
v_reinterpret_as_s32
(
v_int0
));
v_src0
=
_mm_mul_pd
(
v_src0
,
v_src0
);
v_float64x2
v_src1
=
v_cvt_f64_high
(
v_reinterpret_as_s32
(
v_int0
));
v_src1
=
_mm_mul_pd
(
v_src1
,
v_src1
);
v_float64x2
v_src2
=
v_cvt_f64
(
v_reinterpret_as_s32
(
v_int1
));
v_src2
=
_mm_mul_pd
(
v_src2
,
v_src2
);
v_float64x2
v_src3
=
v_cvt_f64_high
(
v_reinterpret_as_s32
(
v_int1
));
v_src3
=
_mm_mul_pd
(
v_src3
,
v_src3
);
v_src0
=
v_src0
*
v_src0
;
v_src1
=
v_src1
*
v_src1
;
__m128d
v_dst0
=
_mm_loadu_pd
(
dst
+
x
);
v_src2
=
v_src2
*
v_src2
;
__m128d
v_dst1
=
_mm_loadu_pd
(
dst
+
x
+
2
);
v_src3
=
v_src3
*
v_src3
;
__m128d
v_dst2
=
_mm_loadu_pd
(
dst
+
x
+
4
);
__m128d
v_dst3
=
_mm_loadu_pd
(
dst
+
x
+
6
);
v_float64x2
v_dst0
=
v_load
(
dst
+
x
);
v_float64x2
v_dst1
=
v_load
(
dst
+
x
+
2
);
v_dst0
=
_mm_add_pd
(
v_dst0
,
v_src0
);
v_float64x2
v_dst2
=
v_load
(
dst
+
x
+
4
);
v_dst1
=
_mm_add_pd
(
v_dst1
,
v_src1
);
v_float64x2
v_dst3
=
v_load
(
dst
+
x
+
6
);
v_dst2
=
_mm_add_pd
(
v_dst2
,
v_src2
);
v_dst3
=
_mm_add_pd
(
v_dst3
,
v_src3
);
v_dst0
+=
v_src0
;
v_dst1
+=
v_src1
;
_mm_storeu_pd
(
dst
+
x
,
v_dst0
);
v_dst2
+=
v_src2
;
_mm_storeu_pd
(
dst
+
x
+
2
,
v_dst1
);
v_dst3
+=
v_src3
;
_mm_storeu_pd
(
dst
+
x
+
4
,
v_dst2
);
_mm_storeu_pd
(
dst
+
x
+
6
,
v_dst3
);
v_store
(
dst
+
x
,
v_dst0
);
v_store
(
dst
+
x
+
2
,
v_dst1
);
v_store
(
dst
+
x
+
4
,
v_dst2
);
v_store
(
dst
+
x
+
6
,
v_dst3
);
}
}
}
}
return
x
;
return
x
;
...
@@ -1056,36 +1034,39 @@ struct AccSqr_SIMD<ushort, double>
...
@@ -1056,36 +1034,39 @@ struct AccSqr_SIMD<ushort, double>
if
(
!
mask
)
if
(
!
mask
)
{
{
__m128i
v_0
=
_mm_setzero_si128
();
len
*=
cn
;
len
*=
cn
;
for
(
;
x
<=
len
-
8
;
x
+=
8
)
for
(
;
x
<=
len
-
8
;
x
+=
8
)
{
{
__m128i
v_src
=
_mm_loadu_si128
((
const
__m128i
*
)(
src
+
x
));
v_uint16x8
v_src
=
v_load
(
src
+
x
);
__m128i
v_int0
=
_mm_unpacklo_epi16
(
v_src
,
v_0
);
v_uint32x4
v_int_0
,
v_int_1
;
__m128i
v_int1
=
_mm_unpackhi_epi16
(
v_src
,
v_0
);
v_expand
(
v_src
,
v_int_0
,
v_int_1
);
__m128d
v_src0
=
_mm_cvtepi32_pd
(
v_int0
);
__m128d
v_src1
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int0
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_int32x4
v_int0
=
v_reinterpret_as_s32
(
v_int_0
);
__m128d
v_src2
=
_mm_cvtepi32_pd
(
v_int1
);
v_int32x4
v_int1
=
v_reinterpret_as_s32
(
v_int_1
);
__m128d
v_src3
=
_mm_cvtepi32_pd
(
_mm_shuffle_epi32
(
v_int1
,
_MM_SHUFFLE
(
0
,
0
,
3
,
2
)));
v_src0
=
_mm_mul_pd
(
v_src0
,
v_src0
);
v_float64x2
v_src0
=
v_cvt_f64
(
v_int0
);
v_src1
=
_mm_mul_pd
(
v_src1
,
v_src1
);
v_float64x2
v_src1
=
v_cvt_f64_high
(
v_int0
);
v_src2
=
_mm_mul_pd
(
v_src2
,
v_src2
);
v_float64x2
v_src2
=
v_cvt_f64
(
v_int1
);
v_src3
=
_mm_mul_pd
(
v_src3
,
v_src3
);
v_float64x2
v_src3
=
v_cvt_f64_high
(
v_int1
);
v_src0
=
v_src0
*
v_src0
;
__m128d
v_dst0
=
_mm_loadu_pd
(
dst
+
x
);
v_src1
=
v_src1
*
v_src1
;
__m128d
v_dst1
=
_mm_loadu_pd
(
dst
+
x
+
2
);
v_src2
=
v_src2
*
v_src2
;
__m128d
v_dst2
=
_mm_loadu_pd
(
dst
+
x
+
4
);
v_src3
=
v_src3
*
v_src3
;
__m128d
v_dst3
=
_mm_loadu_pd
(
dst
+
x
+
6
);
v_float64x2
v_dst0
=
v_load
(
dst
+
x
);
v_dst0
=
_mm_add_pd
(
v_dst0
,
v_src0
);
v_float64x2
v_dst1
=
v_load
(
dst
+
x
+
2
);
v_dst1
=
_mm_add_pd
(
v_dst1
,
v_src1
);
v_float64x2
v_dst2
=
v_load
(
dst
+
x
+
4
);
v_dst2
=
_mm_add_pd
(
v_dst2
,
v_src2
);
v_float64x2
v_dst3
=
v_load
(
dst
+
x
+
6
);
v_dst3
=
_mm_add_pd
(
v_dst3
,
v_src3
);
v_dst0
+=
v_src0
;
_mm_storeu_pd
(
dst
+
x
,
v_dst0
);
v_dst1
+=
v_src1
;
_mm_storeu_pd
(
dst
+
x
+
2
,
v_dst1
);
v_dst2
+=
v_src2
;
_mm_storeu_pd
(
dst
+
x
+
4
,
v_dst2
);
v_dst3
+=
v_src3
;
_mm_storeu_pd
(
dst
+
x
+
6
,
v_dst3
);
v_store
(
dst
+
x
,
v_dst0
);
v_store
(
dst
+
x
+
2
,
v_dst1
);
v_store
(
dst
+
x
+
4
,
v_dst2
);
v_store
(
dst
+
x
+
6
,
v_dst3
);
}
}
}
}
return
x
;
return
x
;
...
@@ -1227,7 +1208,7 @@ struct AccProd_SIMD<ushort, float>
...
@@ -1227,7 +1208,7 @@ struct AccProd_SIMD<ushort, float>
}
}
};
};
#if CV_S
SE2
#if CV_S
IMD128_64F
template
<>
template
<>
struct
AccProd_SIMD
<
uchar
,
double
>
struct
AccProd_SIMD
<
uchar
,
double
>
{
{
...
@@ -1237,38 +1218,44 @@ struct AccProd_SIMD<uchar, double>
         if (!mask)
        {
-            __m128i v_0 = _mm_setzero_si128();
             len *= cn;
             for ( ; x <= len - 8; x += 8)
             {
-                __m128i v_1src = _mm_loadl_epi64((const __m128i*)(src1 + x));
-                __m128i v_2src = _mm_loadl_epi64((const __m128i*)(src2 + x));
-                __m128i v_1int = _mm_unpacklo_epi8(v_1src, v_0);
-                __m128i v_2int = _mm_unpacklo_epi8(v_2src, v_0);
-                __m128i v_1int0 = _mm_unpacklo_epi16(v_1int, v_0);
-                __m128i v_1int1 = _mm_unpackhi_epi16(v_1int, v_0);
-                __m128i v_2int0 = _mm_unpacklo_epi16(v_2int, v_0);
-                __m128i v_2int1 = _mm_unpackhi_epi16(v_2int, v_0);
-                __m128d v_src0 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int0), _mm_cvtepi32_pd(v_2int0));
-                __m128d v_src1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int0, _MM_SHUFFLE(0, 0, 3, 2))), _mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int0, _MM_SHUFFLE(0, 0, 3, 2))));
-                __m128d v_src2 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int1), _mm_cvtepi32_pd(v_2int1));
-                __m128d v_src3 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int1, _MM_SHUFFLE(0, 0, 3, 2))), _mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int1, _MM_SHUFFLE(0, 0, 3, 2))));
-                __m128d v_dst0 = _mm_loadu_pd(dst + x);
-                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
-                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
-                v_dst0 = _mm_add_pd(v_dst0, v_src0);
-                v_dst1 = _mm_add_pd(v_dst1, v_src1);
-                v_dst2 = _mm_add_pd(v_dst2, v_src2);
-                v_dst3 = _mm_add_pd(v_dst3, v_src3);
-                _mm_storeu_pd(dst + x, v_dst0);
-                _mm_storeu_pd(dst + x + 2, v_dst1);
-                _mm_storeu_pd(dst + x + 4, v_dst2);
-                _mm_storeu_pd(dst + x + 6, v_dst3);
+                v_uint8x16 v_1src = v_load(src1 + x);
+                v_uint8x16 v_2src = v_load(src2 + x);
+                v_uint16x8 v_1int, v_2int, dummy;
+                v_expand(v_1src, v_1int, dummy);
+                v_expand(v_2src, v_2int, dummy);
+                v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
+                v_expand(v_1int, v_1int_0, v_1int_1);
+                v_expand(v_2int, v_2int_0, v_2int_1);
+                v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
+                v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
+                v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
+                v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
+                v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
+                v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
+                v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
+                v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
+                v_float64x2 v_dst0 = v_load(dst + x);
+                v_float64x2 v_dst1 = v_load(dst + x + 2);
+                v_float64x2 v_dst2 = v_load(dst + x + 4);
+                v_float64x2 v_dst3 = v_load(dst + x + 6);
+                v_dst0 += v_src0;
+                v_dst1 += v_src1;
+                v_dst2 += v_src2;
+                v_dst3 += v_src3;
+                v_store(dst + x, v_dst0);
+                v_store(dst + x + 2, v_dst1);
+                v_store(dst + x + 4, v_dst2);
+                v_store(dst + x + 6, v_dst3);
             }
         }
         return x;
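For reference, each AccProd_SIMD iteration above computes exactly what the scalar accumulateProduct loop would, dst[i] += (double)src1[i] * src2[i], just eight lanes at a time after the u8 -> u16 -> u32 -> f64 widening chain. A scalar sketch (hypothetical helper, not from the commit):

// Scalar equivalent of one AccProd_SIMD pass: each output lane is
// dst[i] += (double)src1[i] * (double)src2[i].
static void accProdRowScalar(const unsigned char* src1, const unsigned char* src2,
                             double* dst, int len)
{
    for (int x = 0; x < len; x++)
        dst[x] += (double)src1[x] * (double)src2[x];
}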
...
@@ -1284,35 +1271,40 @@ struct AccProd_SIMD<ushort, double>
         if (!mask)
         {
-            __m128i v_0 = _mm_setzero_si128();
             len *= cn;
             for ( ; x <= len - 8; x += 8)
             {
-                __m128i v_1src = _mm_loadu_si128((const __m128i*)(src1 + x));
-                __m128i v_2src = _mm_loadu_si128((const __m128i*)(src2 + x));
-                __m128i v_1int0 = _mm_unpacklo_epi16(v_1src, v_0);
-                __m128i v_1int1 = _mm_unpackhi_epi16(v_1src, v_0);
-                __m128i v_2int0 = _mm_unpacklo_epi16(v_2src, v_0);
-                __m128i v_2int1 = _mm_unpackhi_epi16(v_2src, v_0);
-                __m128d v_src0 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int0), _mm_cvtepi32_pd(v_2int0));
-                __m128d v_src1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int0, _MM_SHUFFLE(0, 0, 3, 2))), _mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int0, _MM_SHUFFLE(0, 0, 3, 2))));
-                __m128d v_src2 = _mm_mul_pd(_mm_cvtepi32_pd(v_1int1), _mm_cvtepi32_pd(v_2int1));
-                __m128d v_src3 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_shuffle_epi32(v_1int1, _MM_SHUFFLE(0, 0, 3, 2))), _mm_cvtepi32_pd(_mm_shuffle_epi32(v_2int1, _MM_SHUFFLE(0, 0, 3, 2))));
-                __m128d v_dst0 = _mm_loadu_pd(dst + x);
-                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
-                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
-                v_dst0 = _mm_add_pd(v_dst0, v_src0);
-                v_dst1 = _mm_add_pd(v_dst1, v_src1);
-                v_dst2 = _mm_add_pd(v_dst2, v_src2);
-                v_dst3 = _mm_add_pd(v_dst3, v_src3);
-                _mm_storeu_pd(dst + x, v_dst0);
-                _mm_storeu_pd(dst + x + 2, v_dst1);
-                _mm_storeu_pd(dst + x + 4, v_dst2);
-                _mm_storeu_pd(dst + x + 6, v_dst3);
+                v_uint16x8 v_1src = v_load(src1 + x);
+                v_uint16x8 v_2src = v_load(src2 + x);
+                v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
+                v_expand(v_1src, v_1int_0, v_1int_1);
+                v_expand(v_2src, v_2int_0, v_2int_1);
+                v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
+                v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
+                v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
+                v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
+                v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
+                v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
+                v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
+                v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
+                v_float64x2 v_dst0 = v_load(dst + x);
+                v_float64x2 v_dst1 = v_load(dst + x + 2);
+                v_float64x2 v_dst2 = v_load(dst + x + 4);
+                v_float64x2 v_dst3 = v_load(dst + x + 6);
+                v_dst0 = v_dst0 + v_src0;
+                v_dst1 = v_dst1 + v_src1;
+                v_dst2 = v_dst2 + v_src2;
+                v_dst3 = v_dst3 + v_src3;
+                v_store(dst + x, v_dst0);
+                v_store(dst + x + 2, v_dst1);
+                v_store(dst + x + 4, v_dst2);
+                v_store(dst + x + 6, v_dst3);
             }
         }
         return x;
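One detail worth noting in these conversions: a v_int32x4 holds four lanes but a v_float64x2 only two, so each 32-bit vector is converted in two steps, v_cvt_f64 for the low pair and v_cvt_f64_high for the high pair (this replaces the _mm_cvtepi32_pd + _mm_shuffle_epi32 idiom on the left-hand side). A small illustrative sketch, assuming a CV_SIMD128_64F build; the function and values are invented for illustration:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128 && CV_SIMD128_64F
static void splitExample()
{
    cv::v_int32x4 v(10, 20, 30, 40);
    cv::v_float64x2 lo = cv::v_cvt_f64(v);        // (10.0, 20.0)
    cv::v_float64x2 hi = cv::v_cvt_f64_high(v);   // (30.0, 40.0)
    double buf[4];
    cv::v_store(buf, lo);
    cv::v_store(buf + 2, hi);                     // buf = {10, 20, 30, 40}
}
#endif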
...
@@ -1399,45 +1391,50 @@ struct AccW_SIMD<ushort, float>
     }
 };
 
-#if CV_SSE2
+#if CV_SIMD128_64F
 template <>
 struct AccW_SIMD<uchar, double>
 {
     int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
     {
         int x = 0;
-        __m128d v_alpha = _mm_set1_pd(alpha);
-        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
-        __m128i v_0 = _mm_setzero_si128();
+        v_float64x2 v_alpha = v_setall_f64(alpha);
+        v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
         if (!mask)
         {
             len *= cn;
             for ( ; x <= len - 8; x += 8)
             {
-                __m128i v_src = _mm_loadl_epi64((const __m128i*)(src + x));
-                __m128i v_int = _mm_unpacklo_epi8(v_src, v_0);
-                __m128i v_int0 = _mm_unpacklo_epi16(v_int, v_0);
-                __m128i v_int1 = _mm_unpackhi_epi16(v_int, v_0);
-                __m128d v_src0 = _mm_cvtepi32_pd(v_int0);
-                __m128d v_src1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
-                __m128d v_src2 = _mm_cvtepi32_pd(v_int1);
-                __m128d v_src3 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
-                __m128d v_dst0 = _mm_loadu_pd(dst + x);
-                __m128d v_dst1 = _mm_loadu_pd(dst + x + 2);
-                __m128d v_dst2 = _mm_loadu_pd(dst + x + 4);
-                __m128d v_dst3 = _mm_loadu_pd(dst + x + 6);
-                v_dst0 = _mm_add_pd(_mm_mul_pd(v_dst0, v_beta), _mm_mul_pd(v_src0, v_alpha));
-                v_dst1 = _mm_add_pd(_mm_mul_pd(v_dst1, v_beta), _mm_mul_pd(v_src1, v_alpha));
-                v_dst2 = _mm_add_pd(_mm_mul_pd(v_dst2, v_beta), _mm_mul_pd(v_src2, v_alpha));
-                v_dst3 = _mm_add_pd(_mm_mul_pd(v_dst3, v_beta), _mm_mul_pd(v_src3, v_alpha));
-                _mm_storeu_pd(dst + x, v_dst0);
-                _mm_storeu_pd(dst + x + 2, v_dst1);
-                _mm_storeu_pd(dst + x + 4, v_dst2);
-                _mm_storeu_pd(dst + x + 6, v_dst3);
+                v_uint8x16 v_src = v_load(src + x);
+                v_uint16x8 v_int, dummy;
+                v_expand(v_src, v_int, dummy);
+                v_uint32x4 v_int_0, v_int_1;
+                v_expand(v_int, v_int_0, v_int_1);
+                v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
+                v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
+                v_float64x2 v_src0 = v_cvt_f64(v_int0);
+                v_float64x2 v_src1 = v_cvt_f64_high(v_int0);
+                v_float64x2 v_src2 = v_cvt_f64(v_int1);
+                v_float64x2 v_src3 = v_cvt_f64_high(v_int1);
+                v_float64x2 v_dst0 = v_load(dst + x);
+                v_float64x2 v_dst1 = v_load(dst + x + 2);
+                v_float64x2 v_dst2 = v_load(dst + x + 4);
+                v_float64x2 v_dst3 = v_load(dst + x + 6);
+                v_dst0 = (v_dst0 * v_beta) + (v_src0 * v_alpha);
+                v_dst1 = (v_dst1 * v_beta) + (v_src1 * v_alpha);
+                v_dst2 = (v_dst2 * v_beta) + (v_src2 * v_alpha);
+                v_dst3 = (v_dst3 * v_beta) + (v_src3 * v_alpha);
+                v_store(dst + x, v_dst0);
+                v_store(dst + x + 2, v_dst1);
+                v_store(dst + x + 4, v_dst2);
+                v_store(dst + x + 6, v_dst3);
             }
         }
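The AccW_SIMD specializations above implement the accumulateWeighted update dst = dst * (1 - alpha) + src * alpha per lane, with v_beta holding 1 - alpha. A scalar sketch of the same update (hypothetical helper, not from the commit):

// Per-lane arithmetic of the AccW kernels: a running weighted average.
// The SIMD body applies this to 2 doubles at a time via
// (v_dst * v_beta) + (v_src * v_alpha), with beta = 1 - alpha.
static void accWeightedRowScalar(const unsigned char* src, double* dst,
                                 int len, double alpha)
{
    const double beta = 1.0 - alpha;
    for (int x = 0; x < len; x++)
        dst[x] = dst[x] * beta + (double)src[x] * alpha;
}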
...
@@ -1451,44 +1448,47 @@ struct AccW_SIMD<ushort, double>
     int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn, double alpha) const
     {
         int x = 0;
-        __m128d v_alpha = _mm_set1_pd(alpha);
-        __m128d v_beta = _mm_set1_pd(1.0f - alpha);
-        __m128i v_0 = _mm_setzero_si128();
+        v_float64x2 v_alpha = v_setall_f64(alpha);
+        v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
         if (!mask)
         {
             len *= cn;
             for ( ; x <= len - 8; x += 8)
             {
-                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
-                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
-                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
-                __m128d v_src00 = _mm_cvtepi32_pd(v_int0);
-                __m128d v_src01 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int0, _MM_SHUFFLE(0, 0, 3, 2)));
-                __m128d v_src10 = _mm_cvtepi32_pd(v_int1);
-                __m128d v_src11 = _mm_cvtepi32_pd(_mm_shuffle_epi32(v_int1, _MM_SHUFFLE(0, 0, 3, 2)));
-                __m128d v_dst00 = _mm_loadu_pd(dst + x);
-                __m128d v_dst01 = _mm_loadu_pd(dst + x + 2);
-                __m128d v_dst10 = _mm_loadu_pd(dst + x + 4);
-                __m128d v_dst11 = _mm_loadu_pd(dst + x + 6);
-                v_dst00 = _mm_add_pd(_mm_mul_pd(v_dst00, v_beta), _mm_mul_pd(v_src00, v_alpha));
-                v_dst01 = _mm_add_pd(_mm_mul_pd(v_dst01, v_beta), _mm_mul_pd(v_src01, v_alpha));
-                v_dst10 = _mm_add_pd(_mm_mul_pd(v_dst10, v_beta), _mm_mul_pd(v_src10, v_alpha));
-                v_dst11 = _mm_add_pd(_mm_mul_pd(v_dst11, v_beta), _mm_mul_pd(v_src11, v_alpha));
-                _mm_storeu_pd(dst + x, v_dst00);
-                _mm_storeu_pd(dst + x + 2, v_dst01);
-                _mm_storeu_pd(dst + x + 4, v_dst10);
-                _mm_storeu_pd(dst + x + 6, v_dst11);
+                v_uint16x8 v_src = v_load(src + x);
+                v_uint32x4 v_int_0, v_int_1;
+                v_expand(v_src, v_int_0, v_int_1);
+                v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
+                v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
+                v_float64x2 v_src00 = v_cvt_f64(v_int0);
+                v_float64x2 v_src01 = v_cvt_f64_high(v_int0);
+                v_float64x2 v_src10 = v_cvt_f64(v_int1);
+                v_float64x2 v_src11 = v_cvt_f64_high(v_int1);
+                v_float64x2 v_dst00 = v_load(dst + x);
+                v_float64x2 v_dst01 = v_load(dst + x + 2);
+                v_float64x2 v_dst10 = v_load(dst + x + 4);
+                v_float64x2 v_dst11 = v_load(dst + x + 6);
+                v_dst00 = (v_dst00 * v_beta) + (v_src00 * v_alpha);
+                v_dst01 = (v_dst01 * v_beta) + (v_src01 * v_alpha);
+                v_dst10 = (v_dst10 * v_beta) + (v_src10 * v_alpha);
+                v_dst11 = (v_dst11 * v_beta) + (v_src11 * v_alpha);
+                v_store(dst + x, v_dst00);
+                v_store(dst + x + 2, v_dst01);
+                v_store(dst + x + 4, v_dst10);
+                v_store(dst + x + 6, v_dst11);
             }
         }
         return x;
     }
 };
 
-#endif //CV_SSE2
+#endif //CV_SIMD128_64F
 #endif //CV_SIMD128
 
 template <typename T, typename AT> void
...
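With this change the same double-precision kernels are compiled wherever the universal-intrinsic layer reports 64-bit float lanes: SSE2 on x86 and, via the intrin_neon.hpp addition in this commit, AArch64 NEON. A minimal sketch of that guard pattern (illustrative function, not from the commit):

// Illustrative only: shows the compile-time guard these specializations rely
// on. The double-lane vector path is built only when CV_SIMD128_64F is 1;
// the scalar loop covers the tail, and the whole row when SIMD is unavailable.
#include <opencv2/core/hal/intrin.hpp>

static void addRow_f64(const double* src, double* dst, int len)
{
    int x = 0;
#if CV_SIMD128 && CV_SIMD128_64F
    for ( ; x <= len - 2; x += 2)   // two doubles per v_float64x2
        cv::v_store(dst + x, cv::v_load(dst + x) + cv::v_load(src + x));
#endif
    for ( ; x < len; x++)           // scalar tail / fallback
        dst[x] += src[x];
}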