Commit 1cc3f7ab authored Oct 15, 2018 by Alexander Alekhin
Merge pull request #12516 from seiko2plus:changeUnvMultiply16
Parents: 803ff64b 8965f3ae
Showing 15 changed files with 735 additions and 169 deletions (+735 / -169)
modules/core/include/opencv2/core/hal/intrin.hpp           +8    -0
modules/core/include/opencv2/core/hal/intrin_avx.hpp        +118  -22
modules/core/include/opencv2/core/hal/intrin_cpp.hpp        +52   -6
modules/core/include/opencv2/core/hal/intrin_forward.hpp    +159  -0
modules/core/include/opencv2/core/hal/intrin_neon.hpp       +43   -2
modules/core/include/opencv2/core/hal/intrin_sse.hpp        +0    -0
modules/core/include/opencv2/core/hal/intrin_sse_em.hpp     +168  -0
modules/core/include/opencv2/core/hal/intrin_vsx.hpp        +35   -18
modules/core/include/opencv2/core/vsx_utils.hpp             +12   -10
modules/core/test/test_intrin_utils.hpp                     +21   -9
modules/imgproc/CMakeLists.txt                              +1    -1
modules/imgproc/perf/perf_accumulate.cpp                    +97   -89
modules/imgproc/src/accum.simd.hpp                          +0    -0
modules/imgproc/src/smooth.cpp                              +19   -10
modules/video/src/lkpyramid.cpp                             +2    -2
modules/core/include/opencv2/core/hal/intrin.hpp
...
...
@@ -139,8 +139,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_FP16
#endif
#if CV_SSE2 || CV_NEON || CV_VSX
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
#if CV_SSE2
#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"
#elif CV_NEON
...
...
@@ -168,6 +174,8 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
#if CV_AVX2
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx.hpp"
#endif
...
...
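The hunks above define CV__SIMD_FORWARD to the register width and pull in the new intrin_forward.hpp before each backend header, so every backend sees forward declarations of the universal vector types and of the new v_expand_low / v_expand_high / v_mul_wrap functions before defining its own versions. A minimal sketch of how a hypothetical extra backend could hook into the same pattern; CV_MY_ISA and intrin_my_isa.hpp are invented names used only to illustrate the mechanism:

// Hypothetical backend hookup, mirroring the CV_SSE2/CV_NEON/CV_VSX branches above.
// CV_MY_ISA and intrin_my_isa.hpp do not exist in OpenCV; they only show the pattern.
#if CV_MY_ISA
    #define CV__SIMD_FORWARD 128                      // pick the 128-bit name set (v_uint8x16, ...)
    #include "opencv2/core/hal/intrin_forward.hpp"    // forward declarations only
    #include "opencv2/core/hal/intrin_my_isa.hpp"     // definitions may now call v_pack(), v_mul_wrap(), ...
#endif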
modules/core/include/opencv2/core/hal/intrin_avx.hpp
...
...
@@ -82,6 +82,14 @@ inline __m128 _v256_extract_low(const __m256& v)
inline __m128d _v256_extract_low(const __m256d& v)
{ return _mm256_castpd256_pd128(v); }

inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
{
    const __m256i m = _mm256_set1_epi32(65535);
    __m256i am = _mm256_min_epu32(a, m);
    __m256i bm = _mm256_min_epu32(b, m);
    return _mm256_packus_epi32(am, bm);
}

///////// Types ////////////

struct v_uint8x32
...
...
@@ -626,10 +634,8 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
...
...
@@ -650,13 +656,103 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)

// saturating multiply 8-bit, 16-bit
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
    v_uint16x16 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d));
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
    v_int16x16 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i pl = _mm256_mullo_epi16(a.val, b.val);
    __m256i ph = _mm256_mulhi_epu16(a.val, b.val);
    __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
    __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
    return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
    __m256i pl = _mm256_mullo_epi16(a.val, b.val);
    __m256i ph = _mm256_mulhi_epi16(a.val, b.val);
    __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
    __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
    return v_int16x16(_mm256_packs_epi32(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16)
inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i ad = _mm256_srai_epi16(a.val, 8);
    __m256i bd = _mm256_srai_epi16(b.val, 8);
    __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
    __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd
    const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
    return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
}
inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}

// Multiply and expand
inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
                         v_uint16x16& c, v_uint16x16& d)
{
    v_uint16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
                         v_int16x16& c, v_int16x16& d)
{
    v_int16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
                         v_int32x8& c, v_int32x8& d)
{
    v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));

    v_int16x16 v0, v1;
    v_zip(a * b, vhi, v0, v1);
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
...
...
@@ -668,7 +764,7 @@ inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
    v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));

    v_uint16x16 v0, v1;
    v_zip(a * b, vhi, v0, v1);
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
...
...
@@ -685,20 +781,6 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b)
{ return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b)
{ return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }

/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
...
...
@@ -1385,6 +1467,10 @@ OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_ca
b0.val = intrin(_v256_extract_low(a.val)); \
b1.val = intrin(_v256_extract_high(a.val)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
inline _Tpwvec v256_load_expand(const _Tp* ptr) \
{ \
__m128i a = _mm_loadu_si128((const __m128i*)ptr); \
...
...
@@ -1430,7 +1516,12 @@ inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }
{
    const __m256i m = _mm256_set1_epi16(255);
    __m256i am = _mm256_min_epu16(a.val, m);
    am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
    v_store_low(ptr, v_uint8x32(am));
}

inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
...
...
@@ -1484,16 +1575,21 @@ inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }

inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }

inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); }
{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
{ v_store_low(ptr, v_pack(a, a)); }
{
    const __m256i m = _mm256_set1_epi32(65535);
    __m256i am = _mm256_min_epu32(a.val, m);
    am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
    v_store_low(ptr, v_uint16x16(am));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
...
...
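The net effect of the AVX hunks above is that operator* on 8- and 16-bit vectors now saturates, while the new v_mul_wrap keeps the old truncating behaviour. A small usage sketch, assuming a build with CV_SIMD enabled and the universal-intrinsics header available; the lane values are chosen only to show the difference:

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

void mul_semantics_demo()
{
    v_uint16 a = vx_setall_u16(60000), b = vx_setall_u16(3);
    v_uint16 sat  = a * b;              // saturating: every lane clamps to 65535
    v_uint16 wrap = v_mul_wrap(a, b);   // wrapping: every lane is 180000 % 65536 = 48928
    uint16_t out_sat[v_uint16::nlanes], out_wrap[v_uint16::nlanes];
    v_store(out_sat, sat);
    v_store(out_wrap, wrap);
}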
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
...
...
@@ -108,7 +108,7 @@ block and to save contents of the register to memory block.
These operations allow to reorder or recombine elements in one or multiple vectors.
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
...
...
@@ -185,11 +185,14 @@ Regular integers:
|load, store | x | x | x | x | x | x |
|interleave | x | x | x | x | x | x |
|expand | x | x | x | x | x | x |
|expand_low | x | x | x | x | x | x |
|expand_high | x | x | x | x | x | x |
|expand_q | x | x | | | | |
|add, sub | x | x | x | x | x | x |
|add_wrap, sub_wrap | x | x | x | x | | |
|mul | | | x | x | x | x |
|mul_expand | | | x | x | x | |
|mul_wrap | x | x | x | x | | |
|mul | x | x | x | x | x | x |
|mul_expand | x | x | x | x | x | |
|compare | x | x | x | x | x | x |
|shift | | | x | x | x | x |
|dotprod | | | | x | | |
...
...
@@ -680,7 +683,7 @@ OPENCV_HAL_IMPL_CMP_OP(!=)
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
...
...
@@ -694,12 +697,17 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
/** @brief Add values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
/** @brief Subtract values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
/** @brief Multiply values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)

//! @cond IGNORED
template<typename T> inline T _absdiff(T a, T b)
...
...
@@ -1106,6 +1114,44 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
}
}
/** @brief Expand lower values to the wider pack type
Same as cv::v_expand, but return lower half of the vector.
Scheme:
@code
int32x4 int64x2
{A B C D} ==> {A B}
@endcode */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> v_expand_low(const v_reg<_Tp, n>& a)
{
    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
    for (int i = 0; i < (n/2); i++)
        b.s[i] = a.s[i];
    return b;
}

/** @brief Expand higher values to the wider pack type
Same as cv::v_expand_low, but expand higher half of the vector instead.
Scheme:
@code
int32x4 int64x2
{A B C D} ==> {C D}
@endcode */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> v_expand_high(const v_reg<_Tp, n>& a)
{
    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
    for (int i = 0; i < (n/2); i++)
        b.s[i] = a.s[i + (n/2)];
    return b;
}

//! @cond IGNORED
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
...
...
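The reference (plain C++) implementations above pin down the intended semantics: v_expand_low widens the first n/2 lanes and v_expand_high the last n/2. A short scalar model of the documented scheme for one 8-lane case, assuming nothing beyond what the comments above describe:

#include <cstdint>

// Scalar model of v_expand_low / v_expand_high for a v_uint16x8-like register.
void expand_model(const uint16_t src[8], uint32_t lo[4], uint32_t hi[4])
{
    for (int i = 0; i < 4; i++)
    {
        lo[i] = src[i];       // v_expand_low: lanes 0..3, zero-extended to 32 bit
        hi[i] = src[i + 4];   // v_expand_high: lanes 4..7
    }
}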
modules/core/include/opencv2/core/hal/intrin_forward.hpp
0 → 100644
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef CV__SIMD_FORWARD
#error "Need to pre-define forward width"
#endif
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
/** Types **/
#if CV__SIMD_FORWARD == 512
// [todo] 512
#error "AVX512 Not implemented yet"
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
#define __CV_V_UINT8 v_uint8x32
#define __CV_V_INT8 v_int8x32
#define __CV_V_UINT16 v_uint16x16
#define __CV_V_INT16 v_int16x16
#define __CV_V_UINT32 v_uint32x8
#define __CV_V_INT32 v_int32x8
#define __CV_V_UINT64 v_uint64x4
#define __CV_V_INT64 v_int64x4
#define __CV_V_FLOAT32 v_float32x8
#define __CV_V_FLOAT64 v_float64x4
struct v_uint8x32;
struct v_int8x32;
struct v_uint16x16;
struct v_int16x16;
struct v_uint32x8;
struct v_int32x8;
struct v_uint64x4;
struct v_int64x4;
struct v_float32x8;
struct v_float64x4;
#else
// 128
#define __CV_VX(fun) v_##fun
#define __CV_V_UINT8 v_uint8x16
#define __CV_V_INT8 v_int8x16
#define __CV_V_UINT16 v_uint16x8
#define __CV_V_INT16 v_int16x8
#define __CV_V_UINT32 v_uint32x4
#define __CV_V_INT32 v_int32x4
#define __CV_V_UINT64 v_uint64x2
#define __CV_V_INT64 v_int64x2
#define __CV_V_FLOAT32 v_float32x4
#define __CV_V_FLOAT64 v_float64x2
struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float32x4;
struct v_float64x2;
#endif
/** Value reordering **/
// Expansion
void v_expand(const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_expand(const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_expand(const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_expand(const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
// Low Expansion
__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_low(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_low(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_low(const __CV_V_INT32&);
// High Expansion
__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&);
__CV_V_INT16 v_expand_high(const __CV_V_INT8&);
__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&);
__CV_V_INT32 v_expand_high(const __CV_V_INT16&);
__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&);
__CV_V_INT64 v_expand_high(const __CV_V_INT32&);
// Load & Low Expansion
__CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
__CV_V_INT16 __CV_VX(load_expand)(const schar*);
__CV_V_UINT32 __CV_VX(load_expand)(const ushort*);
__CV_V_INT32 __CV_VX(load_expand)(const short*);
__CV_V_UINT64 __CV_VX(load_expand)(const uint*);
__CV_V_INT64 __CV_VX(load_expand)(const int*);
// Load lower 8-bit and expand into 32-bit
__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*);
__CV_V_INT32 __CV_VX(load_expand_q)(const schar*);
// Saturating Pack
__CV_V_UINT8 v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT8 v_pack(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&);
__CV_V_INT16 v_pack(const __CV_V_INT32&, const __CV_V_INT32&);
// Non-saturating Pack
__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&);
__CV_V_INT32 v_pack(const __CV_V_INT64&, const __CV_V_INT64&);
// Pack signed integers with unsigned saturation
__CV_V_UINT8 v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&);
__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&);

/** Arithmetic, bitwise and comparison operations **/
// Non-saturating multiply
#if CV_VSX
template<typename Tvec>
Tvec v_mul_wrap(const Tvec& a, const Tvec& b);
#else
__CV_V_UINT8 v_mul_wrap(const __CV_V_UINT8&, const __CV_V_UINT8&);
__CV_V_INT8 v_mul_wrap(const __CV_V_INT8&, const __CV_V_INT8&);
__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&);
__CV_V_INT16 v_mul_wrap(const __CV_V_INT16&, const __CV_V_INT16&);
#endif
// Multiply and expand
#if CV_VSX
template<typename Tvec, typename Twvec>
void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d);
#else
void v_mul_expand(const __CV_V_UINT8&, const __CV_V_UINT8&, __CV_V_UINT16&, __CV_V_UINT16&);
void v_mul_expand(const __CV_V_INT8&, const __CV_V_INT8&, __CV_V_INT16&, __CV_V_INT16&);
void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
void v_mul_expand(const __CV_V_INT16&, const __CV_V_INT16&, __CV_V_INT32&, __CV_V_INT32&);
void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
#endif
/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX
#undef __CV_V_UINT8
#undef __CV_V_INT8
#undef __CV_V_UINT16
#undef __CV_V_INT16
#undef __CV_V_UINT32
#undef __CV_V_INT32
#undef __CV_V_UINT64
#undef __CV_V_INT64
#undef __CV_V_FLOAT32
#undef __CV_V_FLOAT64
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
\ No newline at end of file
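The header is parameterised purely through the preprocessor: __CV_VX(load_expand) expands to v_load_expand for the 128-bit name set and to v256_load_expand for the 256-bit set, and the __CV_V_* aliases map to the matching vector structs. A sketch of what two of the declarations above come out to when CV__SIMD_FORWARD is 256; the expansion is written out by hand, so treat it as illustrative:

// With CV__SIMD_FORWARD == 256 the forward header above amounts to declaring, e.g.:
struct v_uint16x16;
struct v_int16x16;
struct v_int32x8;
v_uint16x16 v256_load_expand(const uchar*);   // from: __CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
v_int32x8   v_expand_low(const v_int16x16&);  // from: __CV_V_INT32 v_expand_low(const __CV_V_INT16&);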
modules/core/include/opencv2/core/hal/intrin_neon.hpp
...
...
@@ -435,10 +435,8 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
...
...
@@ -476,6 +474,37 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
}
#endif
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)

// Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
...
...
@@ -714,6 +743,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)

// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
...
...
@@ -1056,6 +1089,14 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
...
...
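On NEON the new operator* is built from the existing widening multiply (vmull_s8 / vmull_u8) plus the saturating pack, exactly as the OPENCV_HAL_IMPL_NEON_MUL_SAT macro above spells out. A rough scalar model of that "widen, multiply, saturate-pack" sequence for one signed 8-bit lane, for illustration only:

#include <cstdint>
#include <algorithm>

// Scalar model of the NEON path: multiply in 16-bit, then saturate back to int8.
int8_t mul_saturate_s8(int8_t a, int8_t b)
{
    int16_t wide = int16_t(a) * int16_t(b);                                   // v_mul_expand step
    return int8_t(std::max<int16_t>(-128, std::min<int16_t>(127, wide)));     // v_pack step
}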
modules/core/include/opencv2/core/hal/intrin_sse.hpp
This diff is collapsed.
modules/core/include/opencv2/core/hal/intrin_sse_em.hpp
0 → 100644
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
#define OPENCV_HAL_INTRIN_SSE_EM_HPP
namespace cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
inline tp _v128_##fun(const tp& a) \
{ return _mm_##fun(a); }
#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b) \
{ return _mm_##fun(a, b); }
#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
{ return _mm_##fun(a, b, c); }
///////////////////////////// XOP /////////////////////////////
// [todo] define CV_XOP
#if 1 // CV_XOP
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
{
    const __m128i delta = _mm_set1_epi32((int)0x80000000);
    return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
}
// wrapping XOP
#else
OPENCV_HAL_SSE_WRAP_2(_v128_comgt_epu32, __m128i)
#endif // !CV_XOP
///////////////////////////// SSE4.1 /////////////////////////////
#if !CV_SSE4_1
/** Swizzle **/
inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); }

/** Convert **/
// 8 >> 16
inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); }
// 8 >> 32
inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
}
inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
{
    __m128i r = _mm_unpacklo_epi8(a, a);
    r = _mm_unpacklo_epi8(r, r);
    return _mm_srai_epi32(r, 24);
}
// 16 >> 32
inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); }
// 32 >> 64
inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); }

/** Arithmetic **/
inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
{
    __m128i c0 = _mm_mul_epu32(a, b);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return _mm_unpacklo_epi64(d0, d1);
}

/** Math **/
inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); }
// wrapping SSE4.1
#else
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)
OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)
OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)
#endif // !CV_SSE4_1
///////////////////////////// Revolutionary /////////////////////////////
/** Convert **/
// 16 << 8
inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpackhi_epi8(a, z);
}
inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); }
// 32 << 16
inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpackhi_epi16(a, z);
}
inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); }
// 64 << 32
inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpackhi_epi32(a, z);
}
inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); }
/** Miscellaneous **/
inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
{
    const __m128i m = _mm_set1_epi32(65535);
    __m128i am = _v128_min_epu32(a, m);
    __m128i bm = _v128_min_epu32(b, m);
#if CV_SSE4_1
    return _mm_packus_epi32(am, bm);
#else
    const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
    am = _mm_sub_epi32(am, d);
    bm = _mm_sub_epi32(bm, d);
    am = _mm_packs_epi32(am, bm);
    return _mm_sub_epi16(am, nd);
#endif
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
\ No newline at end of file
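The only non-obvious piece in the new emulation header is the SSE2 branch of _v128_packs_epu32: with no unsigned 32-to-16 pack instruction available, values are clamped to 65535, biased down by 32768 so they fit the signed range, packed with _mm_packs_epi32, and then un-biased. A scalar walk-through of that trick for a single lane, purely illustrative:

#include <cstdint>
#include <algorithm>

// Scalar model of the SSE2 fallback in _v128_packs_epu32 for one lane.
uint16_t packs_epu32_model(uint32_t x)
{
    uint32_t clamped = std::min<uint32_t>(x, 65535);   // _v128_min_epu32(a, m)
    int32_t  biased  = int32_t(clamped) - 32768;       // _mm_sub_epi32(am, d): now in [-32768, 32767]
    int16_t  packed  = int16_t(biased);                // _mm_packs_epi32 cannot saturate here
    return uint16_t(packed + 32768);                   // _mm_sub_epi16(am, nd) undoes the bias
}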
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
...
...
@@ -315,6 +315,10 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b0.val = fh(a.val); \
b1.val = fl(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(fh(a.val)); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(fl(a.val)); } \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
...
...
@@ -418,10 +422,8 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
...
...
@@ -441,16 +443,30 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
// saturating multiply
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)

template<typename Tvec, typename Twvec>
inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
{
    c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val));
    d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val));
}
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
    Twvec p0 = Twvec(vec_mule(a.val, b.val));
    Twvec p1 = Twvec(vec_mulo(a.val, b.val));
    v_zip(p0, p1, c, d);
}
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
...
...
@@ -459,17 +475,17 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vec_packs(
        vec_sra(vec_mul(vec_unpackh(a.val), vec_unpackh(b.val)), vec_uint4_sp(16)),
        vec_sra(vec_mul(vec_unpackl(a.val), vec_unpackl(b.val)), vec_uint4_sp(16))
    ));
    vec_int4 p0 = vec_mule(a.val, b.val);
    vec_int4 p1 = vec_mulo(a.val, b.val);
    static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
    return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vec_packs(
        vec_sr(vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)), vec_uint4_sp(16)),
        vec_sr(vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)), vec_uint4_sp(16))
    ));
    vec_uint4 p0 = vec_mule(a.val, b.val);
    vec_uint4 p1 = vec_mulo(a.val, b.val);
    static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
    return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
}
/** Non-saturating arithmetics **/
...
...
@@ -480,6 +496,7 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
...
...
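On VSX the rewritten v_mul_expand and v_mul_hi lean on vec_mule / vec_mulo (full-width products of the even and odd lanes) instead of the old unpack-then-multiply sequence. From the caller's side nothing changes; a portable usage sketch under the universal-intrinsics API, assuming a build with CV_SIMD enabled:

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// x, y must hold at least v_int16::nlanes values; lo, hi at least v_int32::nlanes each.
void mul_expand_demo(const short* x, const short* y, int* lo, int* hi)
{
    v_int16 a = vx_load(x), b = vx_load(y);
    v_int32 c, d;
    v_mul_expand(a, b, c, d);   // full 32-bit products, no overflow
    v_store(lo, c);             // products of the lower half of the lanes
    v_store(hi, d);             // products of the upper half of the lanes
}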
modules/core/include/opencv2/core/vsx_utils.hpp
...
...
@@ -130,19 +130,21 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
# undef vec_mul
# endif
/*
 * there's no a direct instruction for supporting 16-bit multiplication in ISA 2.07,
 * there's no a direct instruction for supporting 8-bit, 16-bit multiplication in ISA 2.07,
 * XLC Implement it by using instruction "multiply even", "multiply odd" and "permute"
 * todo: Do I need to support 8-bit ?
**/
# define VSX_IMPL_MULH(Tvec, Tcast) \
VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
{ \
static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21, \
8, 9, 24, 25, 12, 13, 28, 29}; \
return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
# define VSX_IMPL_MULH(Tvec, cperm) \
VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
{ \
static const vec_uchar16 ev_od = {cperm}; \
return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \
}
VSX_IMPL_MULH(vec_short8, vec_short8_c)
VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
#define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16)
VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
#define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8)
VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
...
...
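The new VSX_IMPL_MULH permute constants deserve a word: vec_mule and vec_mulo return full-width products of the even and odd lanes, and the permute pattern (0, 16, 2, 18, ... for 8-bit lanes) then picks the low half of each product and re-interleaves them back into lane order. A scalar model of the arithmetic outcome for 8-bit lanes; this only reproduces what the mule/mulo + vec_perm combination computes, not how:

#include <cstdint>

// Scalar model: low 8 bits of each per-lane product, i.e. what vec_mul on
// vec_uchar16 ends up producing after the mule/mulo + vec_perm combination.
void vec_mul_u8_model(const uint8_t a[16], const uint8_t b[16], uint8_t out[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = uint8_t(uint16_t(a[i]) * uint16_t(b[i]));   // truncating (wrapping) multiply
}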
modules/core/test/test_intrin_utils.hpp
...
...
@@ -407,10 +407,13 @@ template<typename R> struct TheTest
        Data<Rx2> resB = vx_load_expand(dataA.d);

        Rx2 c, d;
        Rx2 c, d, e, f;
        v_expand(a, c, d);

        Data<Rx2> resC = c, resD = d;
        e = v_expand_low(a);
        f = v_expand_high(a);

        Data<Rx2> resC = c, resD = d, resE = e, resF = f;

        const int n = Rx2::nlanes;
        for (int i = 0; i < n; ++i)
        {
...
...
@@ -418,6 +421,8 @@ template<typename R> struct TheTest
            EXPECT_EQ(dataA[i], resB[i]);
            EXPECT_EQ(dataA[i], resC[i]);
            EXPECT_EQ(dataA[i + n], resD[i]);
            EXPECT_EQ(dataA[i], resE[i]);
            EXPECT_EQ(dataA[i + n], resF[i]);
        }

        return *this;
...
...
@@ -455,19 +460,21 @@ template<typename R> struct TheTest
        return *this;
    }

    TheTest & test_addsub_wrap()
    TheTest & test_arithm_wrap()
    {
        Data<R> dataA, dataB;
        dataB.reverse();
        R a = dataA, b = dataB;

        Data<R> resC = v_add_wrap(a, b),
                resD = v_sub_wrap(a, b);
                resD = v_sub_wrap(a, b),
                resE = v_mul_wrap(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
            EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
            EXPECT_EQ((LaneType)(dataA[i] * dataB[i]), resE[i]);
        }
        return *this;
    }
...
...
@@ -475,6 +482,7 @@ template<typename R> struct TheTest
    TheTest & test_mul()
    {
        Data<R> dataA, dataB;
        dataA[1] = static_cast<LaneType>(std::numeric_limits<LaneType>::max());
        dataB.reverse();
        R a = dataA, b = dataB;
...
...
@@ -482,7 +490,7 @@ template<typename R> struct TheTest
        for (int i = 0; i < R::nlanes; ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
            EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i]), resC[i]);
        }

        return *this;
...
...
@@ -1209,7 +1217,9 @@ void test_hal_intrin_uint8()
        .test_expand()
        .test_expand_q()
        .test_addsub()
        .test_addsub_wrap()
        .test_arithm_wrap()
        .test_mul()
        .test_mul_expand()
        .test_cmp()
        .test_logic()
        .test_min_max()
...
...
@@ -1242,7 +1252,9 @@ void test_hal_intrin_int8()
        .test_expand()
        .test_expand_q()
        .test_addsub()
        .test_addsub_wrap()
        .test_arithm_wrap()
        .test_mul()
        .test_mul_expand()
        .test_cmp()
        .test_logic()
        .test_min_max()
...
...
@@ -1267,7 +1279,7 @@ void test_hal_intrin_uint16()
        .test_interleave()
        .test_expand()
        .test_addsub()
        .test_addsub_wrap()
        .test_arithm_wrap()
        .test_mul()
        .test_mul_expand()
        .test_cmp()
...
...
@@ -1295,7 +1307,7 @@ void test_hal_intrin_int16()
        .test_interleave()
        .test_expand()
        .test_addsub()
        .test_addsub_wrap()
        .test_arithm_wrap()
        .test_mul()
        .test_mul_expand()
        .test_cmp()
...
...
modules/imgproc/CMakeLists.txt
set(the_description "Image Processing")
ocv_add_dispatched_file(accum SSE2 AVX NEON)
ocv_add_dispatched_file(accum SSE4_1 AVX AVX2)
ocv_define_module(imgproc opencv_core WRAP java python js)
modules/imgproc/perf/perf_accumulate.cpp
...
...
@@ -5,94 +5,102 @@
namespace opencv_test {

#ifdef HAVE_OPENVX
PERF_TEST_P(Size_MatType, Accumulate,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_16SC1, CV_32FC1)
            )
)
#else
PERF_TEST_P(Size_MatType, Accumulate,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_32FC1)
            )
)
#endif
{
    Size sz = get<0>(GetParam());
    int dstType = get<1>(GetParam());

    Mat src(sz, CV_8UC1);
    Mat dst(sz, dstType);

    declare.time(100);
    declare.in(src, WARMUP_RNG).out(dst);

    TEST_CYCLE() accumulate(src, dst);

    SANITY_CHECK_NOTHING();
}

#ifdef HAVE_OPENVX
PERF_TEST_P(Size_MatType, AccumulateSquare,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_16SC1, CV_32FC1)
            )
)
#else
PERF_TEST_P(Size_MatType, AccumulateSquare,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_32FC1)
            )
)
#endif
{
    Size sz = get<0>(GetParam());
    int dstType = get<1>(GetParam());

    Mat src(sz, CV_8UC1);
    Mat dst(sz, dstType);

    declare.time(100);
    declare.in(src, WARMUP_RNG).out(dst);

    TEST_CYCLE() accumulateSquare(src, dst);

    SANITY_CHECK_NOTHING();
}

#ifdef HAVE_OPENVX
PERF_TEST_P(Size_MatType, AccumulateWeighted,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_8UC1, CV_32FC1)
            )
)
#else
PERF_TEST_P(Size_MatType, AccumulateWeighted,
            testing::Combine(
                testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
                testing::Values(CV_32FC1)
            )
)
#endif
{
    Size sz = get<0>(GetParam());
    int dstType = get<1>(GetParam());

    Mat src(sz, CV_8UC1);
    Mat dst(sz, dstType);

    declare.time(100);
    declare.in(src, WARMUP_RNG).out(dst);

    TEST_CYCLE() accumulateWeighted(src, dst, 0.314);

    SANITY_CHECK_NOTHING();
}
typedef Size_MatType Accumulate;
#define MAT_TYPES_ACCUMLATE CV_8UC1, CV_16UC1, CV_32FC1
#define MAT_TYPES_ACCUMLATE_C MAT_TYPES_ACCUMLATE, CV_8UC3, CV_16UC3, CV_32FC3
#define MAT_TYPES_ACCUMLATE_D MAT_TYPES_ACCUMLATE, CV_64FC1
#define MAT_TYPES_ACCUMLATE_D_C MAT_TYPES_ACCUMLATE_C, CV_64FC1, CV_64FC1
#define PERF_ACCUMULATE_INIT(_FLTC) \
const Size srcSize = get<0>(GetParam()); \
const int srcType = get<1>(GetParam()); \
const int dstType = _FLTC(CV_MAT_CN(srcType)); \
Mat src1(srcSize, srcType), dst(srcSize, dstType); \
declare.in(src1, dst, WARMUP_RNG).out(dst);
#define PERF_ACCUMULATE_MASK_INIT(_FLTC) \
PERF_ACCUMULATE_INIT(_FLTC) \
Mat mask(srcSize, CV_8UC1); \
declare.in(mask, WARMUP_RNG);
#define PERF_TEST_P_ACCUMULATE(_NAME, _TYPES, _INIT, _FUN) \
PERF_TEST_P(Accumulate, _NAME, \
testing::Combine( \
testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD), \
testing::Values(_TYPES) \
) \
) \
{ \
_INIT \
TEST_CYCLE() _FUN; \
SANITY_CHECK_NOTHING(); \
}
/////////////////////////////////// Accumulate ///////////////////////////////////
PERF_TEST_P_ACCUMULATE(Accumulate, MAT_TYPES_ACCUMLATE, PERF_ACCUMULATE_INIT(CV_32FC), accumulate(src1, dst))
PERF_TEST_P_ACCUMULATE(AccumulateMask, MAT_TYPES_ACCUMLATE_C, PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulate(src1, dst, mask))
PERF_TEST_P_ACCUMULATE(AccumulateDouble, MAT_TYPES_ACCUMLATE_D, PERF_ACCUMULATE_INIT(CV_64FC), accumulate(src1, dst))
PERF_TEST_P_ACCUMULATE(AccumulateDoubleMask, MAT_TYPES_ACCUMLATE_D_C, PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulate(src1, dst, mask))
///////////////////////////// AccumulateSquare ///////////////////////////////////
PERF_TEST_P_ACCUMULATE(Square, MAT_TYPES_ACCUMLATE, PERF_ACCUMULATE_INIT(CV_32FC), accumulateSquare(src1, dst))
PERF_TEST_P_ACCUMULATE(SquareMask, MAT_TYPES_ACCUMLATE_C, PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateSquare(src1, dst, mask))
PERF_TEST_P_ACCUMULATE(SquareDouble, MAT_TYPES_ACCUMLATE_D, PERF_ACCUMULATE_INIT(CV_64FC), accumulateSquare(src1, dst))
PERF_TEST_P_ACCUMULATE(SquareDoubleMask, MAT_TYPES_ACCUMLATE_D_C, PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateSquare(src1, dst, mask))
///////////////////////////// AccumulateProduct ///////////////////////////////////
#define PERF_ACCUMULATE_INIT_2(_FLTC) \
PERF_ACCUMULATE_INIT(_FLTC) \
Mat src2(srcSize, srcType); \
declare.in(src2);
#define PERF_ACCUMULATE_MASK_INIT_2(_FLTC) \
PERF_ACCUMULATE_MASK_INIT(_FLTC) \
Mat src2(srcSize, srcType); \
declare.in(src2);
PERF_TEST_P_ACCUMULATE(Product, MAT_TYPES_ACCUMLATE, PERF_ACCUMULATE_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst))
PERF_TEST_P_ACCUMULATE(ProductMask, MAT_TYPES_ACCUMLATE_C, PERF_ACCUMULATE_MASK_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst, mask))
PERF_TEST_P_ACCUMULATE(ProductDouble, MAT_TYPES_ACCUMLATE_D, PERF_ACCUMULATE_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst))
PERF_TEST_P_ACCUMULATE(ProductDoubleMask, MAT_TYPES_ACCUMLATE_D_C, PERF_ACCUMULATE_MASK_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst, mask))
///////////////////////////// AccumulateWeighted ///////////////////////////////////
PERF_TEST_P_ACCUMULATE(Weighted, MAT_TYPES_ACCUMLATE, PERF_ACCUMULATE_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123))
PERF_TEST_P_ACCUMULATE(WeightedMask, MAT_TYPES_ACCUMLATE_C, PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123, mask))
PERF_TEST_P_ACCUMULATE(WeightedDouble, MAT_TYPES_ACCUMLATE_D, PERF_ACCUMULATE_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456))
PERF_TEST_P_ACCUMULATE(WeightedDoubleMask, MAT_TYPES_ACCUMLATE_D_C, PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456, mask))
} // namespace
modules/imgproc/src/accum.simd.hpp
This diff is collapsed.
modules/imgproc/src/smooth.cpp
...
...
@@ -1825,7 +1825,7 @@ void hlineSmooth1N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m));
    for (; i <= lencn - VECSZ; i += VECSZ)
        v_store((uint16_t*)dst + i, v_mul * vx_load_expand(src + i));
        v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i)));
#endif
    for (; i < lencn; i++)
        dst[i] = m[0] * src[i];
...
...
@@ -1915,7 +1915,9 @@ void hlineSmooth3N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
    v_uint16 v_mul1 = vx_setall_u16(_m[1]);
    v_uint16 v_mul2 = vx_setall_u16(_m[2]);
    for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
        v_store((uint16_t*)dst, vx_load_expand(src - cn) * v_mul0 + vx_load_expand(src) * v_mul1 + vx_load_expand(src + cn) * v_mul2);
        v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) +
                                v_mul_wrap(vx_load_expand(src), v_mul1) +
                                v_mul_wrap(vx_load_expand(src + cn), v_mul2));
#endif
    for (; i < lencn; i++, src++, dst++)
        *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn];
...
...
@@ -2089,7 +2091,8 @@ void hlineSmooth3Naba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const
    v_uint16 v_mul0 = vx_setall_u16(_m[0]);
    v_uint16 v_mul1 = vx_setall_u16(_m[1]);
    for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
        v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn)) * v_mul0 + vx_load_expand(src) * v_mul1);
        v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) +
                                v_mul_wrap(vx_load_expand(src), v_mul1));
#endif
    for (; i < lencn; i++, src++, dst++)
        *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn]));
...
...
@@ -2285,7 +2288,11 @@ void hlineSmooth5N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
    v_uint16 v_mul3 = vx_setall_u16(_m[3]);
    v_uint16 v_mul4 = vx_setall_u16(_m[4]);
    for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
        v_store((uint16_t*)dst, vx_load_expand(src - 2*cn) * v_mul0 + vx_load_expand(src - cn) * v_mul1 + vx_load_expand(src) * v_mul2 + vx_load_expand(src + cn) * v_mul3 + vx_load_expand(src + 2*cn) * v_mul4);
        v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2*cn), v_mul0) +
                                v_mul_wrap(vx_load_expand(src - cn), v_mul1) +
                                v_mul_wrap(vx_load_expand(src), v_mul2) +
                                v_mul_wrap(vx_load_expand(src + cn), v_mul3) +
                                v_mul_wrap(vx_load_expand(src + 2*cn), v_mul4));
#endif
    for (; i < lencn; i++, src++, dst++)
        *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn];
...
...
@@ -2488,7 +2495,7 @@ void hlineSmooth5N14641<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
    const int VECSZ = v_uint16::nlanes;
    v_uint16 v_6 = vx_setall_u16(6);
    for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
        v_store((uint16_t*)dst, (vx_load_expand(src) * v_6 + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)) << 4);
        v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)) << 4);
#endif
    for (; i < lencn; i++, src++, dst++)
        *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2*cn]) + uint16_t(src[2*cn])) << 4;
...
...
@@ -2689,7 +2696,9 @@ void hlineSmooth5Nabcba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
    v_uint16 v_mul1 = vx_setall_u16(_m[1]);
    v_uint16 v_mul2 = vx_setall_u16(_m[2]);
    for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
        v_store((uint16_t*)dst, (vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)) * v_mul0 + (vx_load_expand(src - cn) + vx_load_expand(src + cn)) * v_mul1 + vx_load_expand(src) * v_mul2);
        v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn), v_mul0) +
                                v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul1) +
                                v_mul_wrap(vx_load_expand(src), v_mul2));
#endif
    for (; i < lencn; i++, src++, dst++)
        *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2*cn]) + (uint16_t)(src[2*cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0];
...
...
@@ -2804,9 +2813,9 @@ void hlineSmooth<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixe
    const int VECSZ = v_uint16::nlanes;
    for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
    {
        v_uint16 v_res0 = vx_load_expand(src) * vx_setall_u16(*((uint16_t*)m));
        v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m)));
        for (int j = 1; j < n; j++)
            v_res0 += vx_load_expand(src + j * cn) * vx_setall_u16(*((uint16_t*)(m + j)));
            v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j))));
        v_store((uint16_t*)dst, v_res0);
    }
#endif
...
...
@@ -2923,9 +2932,9 @@ void hlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, co
    const int VECSZ = v_uint16::nlanes;
    for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
    {
        v_uint16 v_res0 = vx_load_expand(src + pre_shift * cn) * vx_setall_u16(*((uint16_t*)(m + pre_shift)));
        v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift))));
        for (int j = 0; j < pre_shift; j++)
            v_res0 += (vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j) * cn)) * vx_setall_u16(*((uint16_t*)(m + j)));
            v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j) * cn), vx_setall_u16(*((uint16_t*)(m + j))));
        v_store((uint16_t*)dst, v_res0);
    }
#endif
...
...
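The smooth.cpp changes follow directly from the new semantics: the ufixedpoint16 filter kernels rely on 16-bit arithmetic that wraps, so every former operator* on v_uint16 is switched to v_mul_wrap to keep results bit-exact now that operator* saturates. A reduced sketch of the pattern, assuming CV_SIMD; the function and variable names here are illustrative, not the ones used in smooth.cpp:

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Multiply expanded 8-bit pixels by a fixed-point weight without saturation,
// mirroring the hlineSmooth* loops above. 'dst' receives v_uint16::nlanes values.
void weigh_pixels(const uchar* src, uint16_t weight, uint16_t* dst)
{
    v_uint16 w  = vx_setall_u16(weight);
    v_uint16 px = vx_load_expand(src);   // widen 8-bit pixels to 16 bit
    v_store(dst, v_mul_wrap(px, w));     // wrapping multiply, as the fixed-point math expects
}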
modules/video/src/lkpyramid.cpp
...
...
@@ -93,7 +93,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
            v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
            v_int16x8 t1 = s2 - s0;
            v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10;
            v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);

            v_store(trow0 + x, t0);
            v_store(trow1 + x, t1);
...
...
@@ -131,7 +131,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
            v_int16x8 s4 = v_load(trow1 + x + cn);
            v_int16x8 t0 = s1 - s0;
            v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10);
            v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);

            v_store_interleave((drow + x*2), t0, t1);
        }
...
...