Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
a275489f
Commit
a275489f
authored
Aug 27, 2015
by
Maksim Shabunin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
HAL universal intrinsics tests and documentation
parent
190d00ea
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
2231 additions
and
118 deletions
+2231
-118
hal.hpp
modules/hal/include/opencv2/hal.hpp
+13
-0
defs.h
modules/hal/include/opencv2/hal/defs.h
+10
-10
intrin.hpp
modules/hal/include/opencv2/hal/intrin.hpp
+28
-0
intrin_cpp.hpp
modules/hal/include/opencv2/hal/intrin_cpp.hpp
+995
-84
intrin_neon.hpp
modules/hal/include/opencv2/hal/intrin_neon.hpp
+24
-7
intrin_sse.hpp
modules/hal/include/opencv2/hal/intrin_sse.hpp
+49
-17
test_intrin.cpp
modules/hal/test/test_intrin.cpp
+864
-0
test_intrin_utils.hpp
modules/hal/test/test_intrin_utils.hpp
+234
-0
test_main.cpp
modules/hal/test/test_main.cpp
+3
-0
test_precomp.hpp
modules/hal/test/test_precomp.hpp
+11
-0
No files found.
modules/hal/include/opencv2/hal.hpp
View file @
a275489f
...
...
@@ -49,10 +49,21 @@
/**
@defgroup hal Hardware Acceleration Layer
@{
@defgroup hal_intrin Universal intrinsics
@{
@defgroup hal_intrin_impl Private implementation helpers
@}
@defgroup hal_utils Platform-dependent utils
@}
*/
namespace
cv
{
namespace
hal
{
//! @addtogroup hal
//! @{
namespace
Error
{
enum
...
...
@@ -93,6 +104,8 @@ void sqrt(const double* src, double* dst, int len);
void
invSqrt
(
const
float
*
src
,
float
*
dst
,
int
len
);
void
invSqrt
(
const
double
*
src
,
double
*
dst
,
int
len
);
//! @}
}}
//cv::hal
#endif //__OPENCV_HAL_HPP__
modules/hal/include/opencv2/hal/defs.h
View file @
a275489f
...
...
@@ -45,6 +45,9 @@
#ifndef __OPENCV_DEF_H__
#define __OPENCV_DEF_H__
//! @addtogroup hal_utils
//! @{
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE
/* to avoid multiple Visual Studio warnings */
#endif
...
...
@@ -335,9 +338,6 @@ Cv64suf;
# include "tegra_round.hpp"
#endif
//! @addtogroup core_utils
//! @{
#if CV_VFP
// 1. general scheme
#define ARM_ROUND(_value, _asm_string) \
...
...
@@ -567,15 +567,19 @@ CV_INLINE int cvIsInf( float value )
return
(
ieee754
.
u
&
0x7fffffff
)
==
0x7f800000
;
}
//! @}
#include <algorithm>
namespace
cv
{
//! @addtogroup hal_utils
//! @{
/////////////// saturate_cast (used in image & signal processing) ///////////////////
/**
Template function for accurate conversion from one primitive type to another.
/** @brief Template function for accurate conversion from one primitive type to another.
The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
and others. They perform an efficient and accurate conversion from one primitive type to another
...
...
@@ -618,8 +622,6 @@ template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp(
/** @overload */
template
<
typename
_Tp
>
static
inline
_Tp
saturate_cast
(
uint64
v
)
{
return
_Tp
(
v
);
}
//! @cond IGNORED
template
<>
inline
uchar
saturate_cast
<
uchar
>
(
schar
v
)
{
return
(
uchar
)
std
::
max
((
int
)
v
,
0
);
}
template
<>
inline
uchar
saturate_cast
<
uchar
>
(
ushort
v
)
{
return
(
uchar
)
std
::
min
((
unsigned
)
v
,
(
unsigned
)
UCHAR_MAX
);
}
template
<>
inline
uchar
saturate_cast
<
uchar
>
(
int
v
)
{
return
(
uchar
)((
unsigned
)
v
<=
UCHAR_MAX
?
v
:
v
>
0
?
UCHAR_MAX
:
0
);
}
...
...
@@ -664,12 +666,10 @@ template<> inline int saturate_cast<int>(double v) { return cvRound(v)
template
<>
inline
unsigned
saturate_cast
<
unsigned
>
(
float
v
)
{
return
cvRound
(
v
);
}
template
<>
inline
unsigned
saturate_cast
<
unsigned
>
(
double
v
)
{
return
cvRound
(
v
);
}
//! @
endcond
//! @
}
}
#endif // __cplusplus
//! @} core_utils
#endif //__OPENCV_HAL_H__
modules/hal/include/opencv2/hal/intrin.hpp
View file @
a275489f
...
...
@@ -48,6 +48,7 @@
#include <cmath>
#include <float.h>
#include <stdlib.h>
#include "opencv2/hal/defs.h"
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
...
...
@@ -59,6 +60,10 @@
// access from within opencv code more accessible
namespace
cv
{
//! @addtogroup hal_intrin
//! @{
//! @cond IGNORED
template
<
typename
_Tp
>
struct
V_TypeTraits
{
typedef
_Tp
int_type
;
...
...
@@ -82,6 +87,7 @@ template<> struct V_TypeTraits<uchar>
typedef
int
sum_type
;
typedef
ushort
w_type
;
typedef
unsigned
q_type
;
enum
{
delta
=
128
,
shift
=
8
};
...
...
@@ -99,6 +105,7 @@ template<> struct V_TypeTraits<schar>
typedef
int
sum_type
;
typedef
short
w_type
;
typedef
int
q_type
;
enum
{
delta
=
128
,
shift
=
8
};
...
...
@@ -265,8 +272,22 @@ template<> struct V_TypeTraits<double>
}
};
template
<
typename
T
>
struct
V_SIMD128Traits
{
enum
{
nlanes
=
16
/
sizeof
(
T
)
};
};
//! @endcond
//! @}
}
#ifdef CV_DOXYGEN
# undef CV_SSE2
# undef CV_NEON
#endif
#if CV_SSE2
#include "opencv2/hal/intrin_sse.hpp"
...
...
@@ -281,12 +302,19 @@ template<> struct V_TypeTraits<double>
#endif
//! @addtogroup hal_intrin
//! @{
#ifndef CV_SIMD128
//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled)
#define CV_SIMD128 0
#endif
#ifndef CV_SIMD128_64F
//! Set to 1 if current intrinsics implementation supports 64-bit float vectors
#define CV_SIMD128_64F 0
#endif
//! @}
#endif
modules/hal/include/opencv2/hal/intrin_cpp.hpp
View file @
a275489f
...
...
@@ -45,25 +45,233 @@
#ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
#define __OPENCV_HAL_INTRIN_CPP_HPP__
#include <limits>
#include <cstring>
namespace
cv
{
/** @addtogroup hal_intrin
"Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
containing packed values of different types. In case when there is no SIMD extension available
during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
expected although it could be slower.
### Types
There are several types representing 128-bit register as a vector of packed values, each type is
implemented as a structure based on a one SIMD register.
- cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
- cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
- cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsgined/signed) - int
- cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
- cv::v_float32x4: four 32-bit floating point values (signed) - float
- cv::v_float64x2: two 64-bit floating point valies (signed) - double
@note
cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
check the CV_SIMD128_64F preprocessor definition:
@code
#if CV_SIMD128_64F
//...
#endif
@endcode
### Load and store operations
These operations allow to set contents of the register explicitly or by loading it from some memory
block and to save contents of the register to memory block.
- Constructors:
@ref v_reg::v_reg(const _Tp *ptr) "from memory",
@ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
- Other create methods:
@ref v_setall_s8, @ref v_setall_u8, ...,
@ref v_setzero_u8, @ref v_setzero_s8, ...
- Memory operations:
@ref v_load, @ref v_load_aligned, @ref v_load_halves,
@ref v_store, @ref v_store_aligned,
@ref v_store_high, @ref v_store_low
### Value reordering
These operations allow to reorder or recombine elements in one or multiple vectors.
- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
- Extract: @ref v_extract
### Arithmetic, bitwise and comparison operations
Element-wise binary and unary operations.
- Arithmetics:
@ref operator+(const v_reg &a, const v_reg &b) "+",
@ref operator-(const v_reg &a, const v_reg &b) "-",
@ref operator*(const v_reg &a, const v_reg &b) "*",
@ref operator/(const v_reg &a, const v_reg &b) "/",
@ref v_mul_expand
- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
- Bitwise shifts:
@ref operator<<(const v_reg &a, int s) "<<",
@ref operator>>(const v_reg &a, int s) ">>",
@ref v_shl, @ref v_shr
- Bitwise logic:
@ref operator&(const v_reg &a, const v_reg &b) "&",
@ref operator|(const v_reg &a, const v_reg &b) "|",
@ref operator^(const v_reg &a, const v_reg &b) "^",
@ref operator~(const v_reg &a) "~"
- Comparison:
@ref operator>(const v_reg &a, const v_reg &b) ">",
@ref operator>=(const v_reg &a, const v_reg &b) ">=",
@ref operator<(const v_reg &a, const v_reg &b) "<",
@ref operator<=(const v_reg &a, const v_reg &b) "<=",
@ref operator==(const v_reg &a, const v_reg &b) "==",
@ref operator!=(const v_reg &a, const v_reg &b) "!="
- min/max: @ref v_min, @ref v_max
### Reduce and mask
Most of these operations return only one value.
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
- Absolute values: @ref v_abs, @ref v_absdiff
### Conversions
Different type conversions and casts:
- Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
- To float: @ref v_cvt_f32, @ref v_cvt_f64
- Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
### Matrix operations
In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
### Usability
Most operations are implemented only for some subset of the available types, following matrices
shows the applicability of different operations to the types.
Regular integers:
| Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
|-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
|load, store | x | x | x | x | x | x |
|interleave | x | x | x | x | x | x |
|expand | x | x | x | x | x | x |
|expand_q | x | x | | | | |
|add, sub | x | x | x | x | x | x |
|add_wrap, sub_wrap | x | x | x | x | | |
|mul | | | x | x | x | x |
|mul_expand | | | x | x | x | |
|compare | x | x | x | x | x | x |
|shift | | | x | x | x | x |
|dotprod | | | | x | | |
|logical | x | x | x | x | x | x |
|min, max | x | x | x | x | x | x |
|absdiff | x | x | x | x | x | x |
|reduce | | | | | x | x |
|mask | x | x | x | x | x | x |
|pack | x | x | x | x | x | x |
|pack_u | x | | x | | | |
|unpack | x | x | x | x | x | x |
|extract | x | x | x | x | x | x |
|cvt_flt32 | | | | | | x |
|cvt_flt64 | | | | | | x |
|transpose4x4 | | | | | x | x |
Big integers:
| Operations\\Types | uint 64x2 | int 64x2 |
|-------------------|:-:|:-:|
|load, store | x | x |
|add, sub | x | x |
|shift | x | x |
|logical | x | x |
|extract | x | x |
Floating point:
| Operations\\Types | float 32x4 | float 64x2 |
|-------------------|:-:|:-:|
|load, store | x | x |
|interleave | x | |
|add, sub | x | x |
|mul | x | x |
|div | x | x |
|compare | x | x |
|min, max | x | x |
|absdiff | x | x |
|reduce | x | |
|mask | x | x |
|unpack | x | x |
|cvt_flt32 | | x |
|cvt_flt64 | x | |
|sqrt, abs | x | x |
|float math | x | x |
|transpose4x4 | x | |
@{ */
template
<
typename
_Tp
,
int
n
>
struct
v_reg
{
//! @cond IGNORED
typedef
_Tp
lane_type
;
typedef
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
int_type
,
n
>
int_vec
;
typedef
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>
abs_vec
;
enum
{
nlanes
=
n
};
// !@endcond
/** @brief Constructor
Initializes register with data from memory
@param ptr pointer to memory block with data for register */
explicit
v_reg
(
const
_Tp
*
ptr
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
s
[
i
]
=
ptr
[
i
];
}
/** @brief Constructor
Initializes register with two 64-bit values */
v_reg
(
_Tp
s0
,
_Tp
s1
)
{
s
[
0
]
=
s0
;
s
[
1
]
=
s1
;
}
/** @brief Constructor
Initializes register with four 32-bit values */
v_reg
(
_Tp
s0
,
_Tp
s1
,
_Tp
s2
,
_Tp
s3
)
{
s
[
0
]
=
s0
;
s
[
1
]
=
s1
;
s
[
2
]
=
s2
;
s
[
3
]
=
s3
;
}
/** @brief Constructor
Initializes register with eight 16-bit values */
v_reg
(
_Tp
s0
,
_Tp
s1
,
_Tp
s2
,
_Tp
s3
,
_Tp
s4
,
_Tp
s5
,
_Tp
s6
,
_Tp
s7
)
{
s
[
0
]
=
s0
;
s
[
1
]
=
s1
;
s
[
2
]
=
s2
;
s
[
3
]
=
s3
;
s
[
4
]
=
s4
;
s
[
5
]
=
s5
;
s
[
6
]
=
s6
;
s
[
7
]
=
s7
;
}
/** @brief Constructor
Initializes register with sixteen 8-bit values */
v_reg
(
_Tp
s0
,
_Tp
s1
,
_Tp
s2
,
_Tp
s3
,
_Tp
s4
,
_Tp
s5
,
_Tp
s6
,
_Tp
s7
,
_Tp
s8
,
_Tp
s9
,
_Tp
s10
,
_Tp
s11
,
...
...
@@ -75,15 +283,31 @@ template<typename _Tp, int n> struct v_reg
s
[
12
]
=
s12
;
s
[
13
]
=
s13
;
s
[
14
]
=
s14
;
s
[
15
]
=
s15
;
}
/** @brief Default constructor
Does not initialize anything*/
v_reg
()
{}
/** @brief Copy constructor */
v_reg
(
const
v_reg
<
_Tp
,
n
>
&
r
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
s
[
i
]
=
r
.
s
[
i
];
}
/** @brief Access first value
Returns value of the first lane according to register type, for example:
@code{.cpp}
v_int32x4 r(1, 2, 3, 4);
int v = r.get0(); // returns 1
v_uint64x2 r(1, 2);
uint64_t v = r.get0(); // returns 1
@endcode
*/
_Tp
get0
()
const
{
return
s
[
0
];
}
//! @cond IGNORED
_Tp
get
(
const
int
i
)
const
{
return
s
[
i
];
}
_Tp
get0
()
const
{
return
s
[
0
];
}
v_reg
<
_Tp
,
n
>
high
()
const
{
v_reg
<
_Tp
,
n
>
c
;
...
...
@@ -116,13 +340,37 @@ template<typename _Tp, int n> struct v_reg
{
size_t
bytes
=
std
::
min
(
sizeof
(
_Tp2
)
*
n2
,
sizeof
(
_Tp
)
*
n
);
v_reg
<
_Tp2
,
n2
>
c
;
memcpy
(
&
c
.
s
[
0
],
&
s
[
0
],
bytes
);
std
::
memcpy
(
&
c
.
s
[
0
],
&
s
[
0
],
bytes
);
return
c
;
}
_Tp
s
[
n
];
//! @endcond
};
/** @brief Sixteen 8-bit unsigned integer values */
typedef
v_reg
<
uchar
,
16
>
v_uint8x16
;
/** @brief Sixteen 8-bit signed integer values */
typedef
v_reg
<
schar
,
16
>
v_int8x16
;
/** @brief Eight 16-bit unsigned integer values */
typedef
v_reg
<
ushort
,
8
>
v_uint16x8
;
/** @brief Eight 16-bit signed integer values */
typedef
v_reg
<
short
,
8
>
v_int16x8
;
/** @brief Four 32-bit unsigned integer values */
typedef
v_reg
<
unsigned
,
4
>
v_uint32x4
;
/** @brief Four 32-bit signed integer values */
typedef
v_reg
<
int
,
4
>
v_int32x4
;
/** @brief Four 32-bit floating point values (single precision) */
typedef
v_reg
<
float
,
4
>
v_float32x4
;
/** @brief Two 64-bit floating point values (double precision) */
typedef
v_reg
<
double
,
2
>
v_float64x2
;
/** @brief Two 64-bit unsigned integer values */
typedef
v_reg
<
uint64
,
2
>
v_uint64x2
;
/** @brief Two 64-bit signed integer values */
typedef
v_reg
<
int64
,
2
>
v_int64x2
;
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> \
operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
...
...
@@ -140,11 +388,28 @@ template<typename _Tp, int n> inline v_reg<_Tp, n>& \
return a; \
}
/** @brief Add values
For all types. */
OPENCV_HAL_IMPL_BIN_OP
(
+
)
/** @brief Subtract values
For all types. */
OPENCV_HAL_IMPL_BIN_OP
(
-
)
/** @brief Multiply values
For 16- and 32-bit integer types and floating types. */
OPENCV_HAL_IMPL_BIN_OP
(
*
)
/** @brief Divide values
For floating types only. */
OPENCV_HAL_IMPL_BIN_OP
(
/
)
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
...
...
@@ -166,10 +431,24 @@ template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
return a; \
}
/** @brief Bitwise AND
Only for integer types. */
OPENCV_HAL_IMPL_BIT_OP
(
&
)
/** @brief Bitwise OR
Only for integer types. */
OPENCV_HAL_IMPL_BIT_OP
(
|
)
/** @brief Bitwise XOR
Only for integer types.*/
OPENCV_HAL_IMPL_BIT_OP
(
^
)
/** @brief Bitwise NOT
Only for integer types.*/
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
operator
~
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
_Tp
,
n
>
c
;
...
...
@@ -178,6 +457,8 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp,
return
c
;
}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
...
...
@@ -187,27 +468,59 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
return c; \
}
/** @brief Square root of elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC
(
v_sqrt
,
std
::
sqrt
,
_Tp
)
//! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC
(
v_sin
,
std
::
sin
,
_Tp
)
OPENCV_HAL_IMPL_MATH_FUNC
(
v_cos
,
std
::
cos
,
_Tp
)
OPENCV_HAL_IMPL_MATH_FUNC
(
v_exp
,
std
::
exp
,
_Tp
)
OPENCV_HAL_IMPL_MATH_FUNC
(
v_log
,
std
::
log
,
_Tp
)
//! @endcond
/** @brief Absolute value of elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC
(
v_abs
,
(
typename
V_TypeTraits
<
_Tp
>::
abs_type
)
std
::
abs
,
typename
V_TypeTraits
<
_Tp
>::
abs_type
)
/** @brief Round elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC
(
v_round
,
cvRound
,
int
)
/** @brief Floor elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC
(
v_floor
,
cvFloor
,
int
)
/** @brief Ceil elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC
(
v_ceil
,
cvCeil
,
int
)
/** @brief Truncate elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC
(
v_trunc
,
int
,
int
)
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = cfunc(a.s[i], b.s[i]); \
return c; \
} \
template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
_Tp c = a.s[0]; \
for( int i = 1; i < n; i++ ) \
...
...
@@ -215,9 +528,49 @@ template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
return c; \
}
OPENCV_HAL_IMPL_MINMAX_FUNC
(
v_min
,
v_reduce_min
,
std
::
min
)
OPENCV_HAL_IMPL_MINMAX_FUNC
(
v_max
,
v_reduce_max
,
std
::
max
)
/** @brief Choose min values for each pair
Scheme:
@code
{A1 A2 ...}
{B1 B2 ...}
--------------
{min(A1,B1) min(A2,B2) ...}
@endcode
For all types except 64-bit integer. */
OPENCV_HAL_IMPL_MINMAX_FUNC
(
v_min
,
std
::
min
)
/** @brief Choose max values for each pair
Scheme:
@code
{A1 A2 ...}
{B1 B2 ...}
--------------
{max(A1,B1) max(A2,B2) ...}
@endcode
For all types except 64-bit integer. */
OPENCV_HAL_IMPL_MINMAX_FUNC
(
v_max
,
std
::
max
)
/** @brief Find one min value
Scheme:
@code
{A1 A2 A3 ...} => min(A1,A2,A3,...)
@endcode
For 32-bit integer and 32-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC
(
v_reduce_min
,
std
::
min
)
/** @brief Find one max value
Scheme:
@code
{A1 A2 A3 ...} => max(A1,A2,A3,...)
@endcode
For 32-bit integer and 32-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC
(
v_reduce_max
,
std
::
max
)
//! @cond IGNORED
template
<
typename
_Tp
,
int
n
>
inline
void
v_minmax
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
_Tp
,
n
>&
minval
,
v_reg
<
_Tp
,
n
>&
maxval
)
...
...
@@ -228,8 +581,10 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
maxval
.
s
[
i
]
=
std
::
max
(
a
.
s
[
i
],
b
.
s
[
i
]);
}
}
//! @endcond
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
...
...
@@ -241,13 +596,38 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
return c; \
}
/** @brief Less-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP
(
<
)
/** @brief Greater-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP
(
>
)
/** @brief Less-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP
(
<=
)
/** @brief Greater-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP
(
>=
)
/** @brief Equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP
(
==
)
/** @brief Not equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP
(
!=
)
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
...
...
@@ -259,10 +639,73 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
return c; \
}
/** @brief Add values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP
(
v_add_wrap
,
+
,
(
_Tp
),
_Tp
)
/** @brief Subtract values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP
(
v_sub_wrap
,
-
,
(
_Tp
),
_Tp
)
OPENCV_HAL_IMPL_ADD_SUB_OP
(
v_absdiff
,
-
,
(
rtype
)
std
::
abs
,
typename
V_TypeTraits
<
_Tp
>::
abs_type
)
//! @cond IGNORED
template
<
typename
T
>
inline
T
_absdiff
(
T
a
,
T
b
)
{
return
a
>
b
?
a
-
b
:
b
-
a
;
}
//! @endcond
/** @brief Absolute difference
Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
Example:
@code{.cpp}
v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
@endcode
For 8-, 16-, 32-bit integer source types. */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>
v_absdiff
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>
&
b
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
abs_type
rtype
;
v_reg
<
rtype
,
n
>
c
;
const
rtype
mask
=
std
::
numeric_limits
<
_Tp
>::
is_signed
?
(
1
<<
(
sizeof
(
rtype
)
*
8
-
1
))
:
0
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
rtype
ua
=
a
.
s
[
i
]
^
mask
;
rtype
ub
=
b
.
s
[
i
]
^
mask
;
c
.
s
[
i
]
=
_absdiff
(
ua
,
ub
);
}
return
c
;
}
/** @overload
For 32-bit floating point values */
inline
v_float32x4
v_absdiff
(
const
v_float32x4
&
a
,
const
v_float32x4
&
b
)
{
v_float32x4
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
;
i
++
)
c
.
s
[
i
]
=
_absdiff
(
a
.
s
[
i
],
b
.
s
[
i
]);
return
c
;
}
/** @overload
For 64-bit floating point values */
inline
v_float64x2
v_absdiff
(
const
v_float64x2
&
a
,
const
v_float64x2
&
b
)
{
v_float64x2
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
;
i
++
)
c
.
s
[
i
]
=
_absdiff
(
a
.
s
[
i
],
b
.
s
[
i
]);
return
c
;
}
/** @brief Inversed square root
Returns \f$ 1/sqrt(a) \f$
For floating point types only. */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_invsqrt
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
...
...
@@ -272,6 +715,10 @@ inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
return
c
;
}
/** @brief Magnitude
Returns \f$ sqrt(a^2 + b^2) \f$
For floating point types only. */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_magnitude
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
...
...
@@ -281,7 +728,10 @@ inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
return
c
;
}
/** @brief Square of the magnitude
Returns \f$ a^2 + b^2 \f$
For floating point types only. */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_sqr_magnitude
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
...
...
@@ -291,6 +741,10 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
return
c
;
}
/** @brief Multiply and add
Returns \f$ a*b + c \f$
For floating point types only. */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_muladd
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
_Tp
,
n
>&
c
)
...
...
@@ -301,6 +755,18 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
return
d
;
}
/** @brief Dot product of elements
Multiply values in two registers and sum adjacent result pairs.
Scheme:
@code
{A1 A2 ...} // 16-bit
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2 ...} // 32-bit
@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_dotprod
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
...
...
@@ -311,6 +777,25 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
return
c
;
}
/** @brief Multiply and expand
Multiply values two registers and store results in two registers with wider pack type.
Scheme:
@code
{A B C D} // 32-bit
x {E F G H} // 32-bit
---------------
{AE BF} // 64-bit
{CG DH} // 64-bit
@endcode
Example:
@code{.cpp}
v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
v_uint64x2 c, d; // results
v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
@endcode
Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
*/
template
<
typename
_Tp
,
int
n
>
inline
void
v_mul_expand
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
c
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
d
)
...
...
@@ -318,11 +803,12 @@ template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, c
typedef
typename
V_TypeTraits
<
_Tp
>::
w_type
w_type
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
{
c
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
]
*
b
.
s
[
i
]
*
2
;
c
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
]
*
b
.
s
[
i
];
d
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
+
(
n
/
2
)]
*
b
.
s
[
i
+
(
n
/
2
)];
}
}
//! @cond IGNORED
template
<
typename
_Tp
,
int
n
>
inline
void
v_hsum
(
const
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
c
)
{
...
...
@@ -332,7 +818,10 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
c
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
*
2
]
+
a
.
s
[
i
*
2
+
1
];
}
}
//! @endcond
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
...
...
@@ -342,9 +831,23 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg
return c; \
}
/** @brief Bitwise shift left
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP
(
<<
)
/** @brief Bitwise shift right
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP
(
>>
)
/** @brief Sum packed values
Scheme:
@code
{A1 A2 A3 ...} => sum{A1,A2,A3,...}
@endcode
For 32-bit integer and 32-bit floating point types.*/
template
<
typename
_Tp
,
int
n
>
inline
typename
V_TypeTraits
<
_Tp
>::
sum_type
v_reduce_sum
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
typename
V_TypeTraits
<
_Tp
>::
sum_type
c
=
a
.
s
[
0
];
...
...
@@ -353,6 +856,15 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
return
c
;
}
/** @brief Get negative values mask
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
Example:
@code{.cpp}
v_int32x4 r; // set to {-1, -1, 1, 1}
int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
@endcode
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
int
v_signmask
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
int
mask
=
0
;
...
...
@@ -361,6 +873,10 @@ template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
return
mask
;
}
/** @brief Check if all packed values are less than zero
Unsigned values will be casted to signed: `uchar 254 => char -2`.
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
bool
v_check_all
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
...
...
@@ -369,6 +885,10 @@ template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
return
true
;
}
/** @brief Check if any of packed values is less than zero
Unsigned values will be casted to signed: `uchar 254 => char -2`.
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
bool
v_check_any
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
...
...
@@ -377,15 +897,36 @@ template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
return
false
;
}
/** @brief Bitwise select
Return value will be built by combining values a and b using the following scheme:
If the i-th bit in _mask_ is 1
select i-th bit from _a_
else
select i-th bit from _b_ */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_select
(
const
v_reg
<
_Tp
,
n
>&
mask
,
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
typedef
V_TypeTraits
<
_Tp
>
Traits
;
typedef
typename
Traits
::
int_type
int_type
;
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
V_TypeTraits
<
_Tp
>::
reinterpret_int
(
mask
.
s
[
i
])
<
0
?
b
.
s
[
i
]
:
a
.
s
[
i
];
{
int_type
m
=
Traits
::
reinterpret_int
(
mask
.
s
[
i
]);
c
.
s
[
i
]
=
Traits
::
reinterpret_from_int
((
Traits
::
reinterpret_int
(
a
.
s
[
i
])
&
m
)
|
(
Traits
::
reinterpret_int
(
b
.
s
[
i
])
&
~
m
));
}
return
c
;
}
/** @brief Expand values to the wider pack type
Copy contents of register to two registers with 2x wider pack type.
Scheme:
@code
int32x4 int64x2 int64x2
{A B C D} ==> {A B} , {C D}
@endcode */
template
<
typename
_Tp
,
int
n
>
inline
void
v_expand
(
const
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
b0
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
b1
)
...
...
@@ -397,6 +938,7 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
}
}
//! @cond IGNORED
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
int_type
,
n
>
v_reinterpret_as_int
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
...
...
@@ -414,7 +956,19 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type
c
.
s
[
i
]
=
V_TypeTraits
<
_Tp
>::
reinterpret_uint
(
a
.
s
[
i
]);
return
c
;
}
//! @endcond
/** @brief Interleave two vectors
Scheme:
@code
{A1 A2 A3 A4}
{B1 B2 B3 B4}
---------------
{A1 B1 A2 B2} and {A3 B3 A4 B4}
@endcode
For all types except 64-bit.
*/
template
<
typename
_Tp
,
int
n
>
inline
void
v_zip
(
const
v_reg
<
_Tp
,
n
>&
a0
,
const
v_reg
<
_Tp
,
n
>&
a1
,
v_reg
<
_Tp
,
n
>&
b0
,
v_reg
<
_Tp
,
n
>&
b1
)
{
...
...
@@ -431,50 +985,102 @@ template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const
}
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_load
(
const
_Tp
*
ptr
)
/** @brief Load register contents from memory
@param ptr pointer to memory block with data
@return register object
@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
*/
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_SIMD128Traits
<
_Tp
>::
nlanes
>
v_load
(
const
_Tp
*
ptr
)
{
return
v_reg
<
_Tp
,
n
>
(
ptr
);
return
v_reg
<
_Tp
,
V_SIMD128Traits
<
_Tp
>::
nlanes
>
(
ptr
);
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_load_aligned
(
const
_Tp
*
ptr
)
/** @brief Load register contents from memory (aligned)
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
*/
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_SIMD128Traits
<
_Tp
>::
nlanes
>
v_load_aligned
(
const
_Tp
*
ptr
)
{
return
v_reg
<
_Tp
,
n
>
(
ptr
);
return
v_reg
<
_Tp
,
V_SIMD128Traits
<
_Tp
>::
nlanes
>
(
ptr
);
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_load_halves
(
const
_Tp
*
loptr
,
const
_Tp
*
hiptr
)
/** @brief Load register contents from two memory blocks
@param loptr memory block containing data for first half (0..n/2)
@param hiptr memory block containing data for second half (n/2..n)
@code{.cpp}
int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
v_int32x4 r = v_load_halves(lo, hi);
@endcode
*/
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_SIMD128Traits
<
_Tp
>::
nlanes
>
v_load_halves
(
const
_Tp
*
loptr
,
const
_Tp
*
hiptr
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
/
2
;
i
++
)
v_reg
<
_Tp
,
V_SIMD128Traits
<
_Tp
>::
nlanes
>
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
/
2
;
i
++
)
{
c
.
s
[
i
]
=
loptr
[
i
];
c
.
s
[
i
+
n
/
2
]
=
hiptr
[
i
];
c
.
s
[
i
+
c
.
nlanes
/
2
]
=
hiptr
[
i
];
}
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
>
v_load_expand
(
const
_Tp
*
ptr
)
/** @brief Load register contents from memory with double expand
Same as cv::v_load, but result pack type will be 2x wider than memory type.
@code{.cpp}
short buf[4] = {1, 2, 3, 4}; // type is int16
v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-, 16-, 32-bit integer source types. */
template
<
typename
_Tp
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
V_SIMD128Traits
<
_Tp
>::
nlanes
/
2
>
v_load_expand
(
const
_Tp
*
ptr
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
w_type
w_type
;
v_reg
<
w_type
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
v_reg
<
w_type
,
V_SIMD128Traits
<
w_type
>::
nlanes
>
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
;
i
++
)
{
c
.
s
[
i
]
=
ptr
[
i
];
}
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
typename
V_TypeTraits
<
_Tp
>::
w_type
>::
w_type
,
n
>
v_load_expand_q
(
const
_Tp
*
ptr
)
/** @brief Load register contents from memory with quad expand
Same as cv::v_load_expand, but result type is 4 times wider than source.
@code{.cpp}
char buf[4] = {1, 2, 3, 4}; // type is int8
v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-bit integer source types. */
template
<
typename
_Tp
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
q_type
,
V_SIMD128Traits
<
_Tp
>::
nlanes
/
4
>
v_load_expand_q
(
const
_Tp
*
ptr
)
{
typedef
typename
V_TypeTraits
<
typename
V_TypeTraits
<
_Tp
>::
w_type
>::
w_type
w
_type
;
v_reg
<
w_type
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
typedef
typename
V_TypeTraits
<
_Tp
>::
q_type
q
_type
;
v_reg
<
q_type
,
V_SIMD128Traits
<
q_type
>::
nlanes
>
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
;
i
++
)
{
c
.
s
[
i
]
=
ptr
[
i
];
}
return
c
;
}
/** @brief Load and deinterleave (4 channels)
Load data from memory deinterleave and store to 4 registers.
Scheme:
@code
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
@endcode
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
void
v_load_deinterleave
(
const
_Tp
*
ptr
,
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
_Tp
,
n
>&
c
)
{
...
...
@@ -487,6 +1093,14 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
}
}
/** @brief Load and deinterleave (3 channels)
Load data from memory deinterleave and store to 3 registers.
Scheme:
@code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
@endcode
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
void
v_load_deinterleave
(
const
_Tp
*
ptr
,
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
_Tp
,
n
>&
c
,
...
...
@@ -502,6 +1116,14 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
}
}
/** @brief Interleave and store (3 channels)
Interleave and store data from 3 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
@endcode
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_interleave
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
_Tp
,
n
>&
c
)
...
...
@@ -515,6 +1137,14 @@ inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
}
}
/** @brief Interleave and store (4 channels)
Interleave and store data from 4 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
@endcode
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_interleave
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
_Tp
,
n
>&
c
,
const
v_reg
<
_Tp
,
n
>&
d
)
...
...
@@ -529,6 +1159,14 @@ template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_
}
}
/** @brief Store data to memory
Store register contents to memory.
Scheme:
@code
REG {A B C D} ==> MEM {A B C D}
@endcode
Pointer can be unaligned. */
template
<
typename
_Tp
,
int
n
>
inline
void
v_store
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
...
...
@@ -536,6 +1174,13 @@ inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr
[
i
]
=
a
.
s
[
i
];
}
/** @brief Store data to memory (lower half)
Store lower half of register contents to memory.
Scheme:
@code
REG {A B C D} ==> MEM {A B}
@endcode */
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_low
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
...
...
@@ -543,6 +1188,13 @@ inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr
[
i
]
=
a
.
s
[
i
];
}
/** @brief Store data to memory (higher half)
Store higher half of register contents to memory.
Scheme:
@code
REG {A B C D} ==> MEM {C D}
@endcode */
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_high
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
...
...
@@ -550,6 +1202,14 @@ inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr
[
i
]
=
a
.
s
[
i
+
(
n
/
2
)];
}
/** @brief Store data to memory (aligned)
Store register contents to memory.
Scheme:
@code
REG {A B C D} ==> MEM {A B C D}
@endcode
Pointer __should__ be aligned by 16-byte boundary. */
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_aligned
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
...
...
@@ -557,6 +1217,16 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr
[
i
]
=
a
.
s
[
i
];
}
/** @brief Combine vector from first elements of two vectors
Scheme:
@code
{A1 A2 A3 A4}
{B1 B2 B3 B4}
---------------
{A1 A2 B1 B2}
@endcode
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_combine_low
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
...
...
@@ -569,6 +1239,16 @@ inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
return
c
;
}
/** @brief Combine vector from last elements of two vectors
Scheme:
@code
{A1 A2 A3 A4}
{B1 B2 B3 B4}
---------------
{A3 A4 B3 B4}
@endcode
For all types except 64-bit. */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_combine_high
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
...
...
@@ -581,6 +1261,12 @@ inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
return
c
;
}
/** @brief Combine two vectors from lower and higher parts of two other vectors
@code{.cpp}
low = cv::v_combine_low(a, b);
high = cv::v_combine_high(a, b);
@endcode */
template
<
typename
_Tp
,
int
n
>
inline
void
v_recombine
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
_Tp
,
n
>&
low
,
v_reg
<
_Tp
,
n
>&
high
)
...
...
@@ -594,18 +1280,41 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
}
}
/** @brief Vector extract
Scheme:
@code
{A1 A2 A3 A4}
{B1 B2 B3 B4}
========================
shift = 1 {A2 A3 A4 B1}
shift = 2 {A3 A4 B1 B2}
shift = 3 {A4 B1 B2 B3}
@endcode
Restriction: 0 <= shift < nlanes
Usage:
@code
v_int32x4 a, b, c;
c = v_extract<2>(a, b);
@endcode
For integer types only. */
template
<
int
s
,
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_extract
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
v_reg
<
_Tp
,
n
>
r
;
const
int
shift
=
n
-
s
;
int
i
=
0
;
for
(;
i
<
s
;
++
i
)
r
.
s
[
i
]
=
a
.
s
[
i
+
n
-
s
];
for
(;
i
<
s
hift
;
++
i
)
r
.
s
[
i
]
=
a
.
s
[
i
+
s
];
for
(;
i
<
n
;
++
i
)
r
.
s
[
i
]
=
b
.
s
[
i
-
s
];
r
.
s
[
i
]
=
b
.
s
[
i
-
s
hift
];
return
r
;
}
/** @brief Round
Rounds each value. Input type is float vector ==> output type is int vector.*/
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_round
(
const
v_reg
<
float
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
...
...
@@ -614,6 +1323,9 @@ template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
return
c
;
}
/** @brief Floor
Floor each value. Input type is float vector ==> output type is int vector.*/
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_floor
(
const
v_reg
<
float
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
...
...
@@ -622,6 +1334,9 @@ template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
return
c
;
}
/** @brief Ceil
Ceil each value. Input type is float vector ==> output type is int vector.*/
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_ceil
(
const
v_reg
<
float
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
...
...
@@ -630,6 +1345,9 @@ template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
return
c
;
}
/** @brief Trunc
Truncate each value. Input type is float vector ==> output type is int vector.*/
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_trunc
(
const
v_reg
<
float
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
...
...
@@ -638,6 +1356,7 @@ template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
return
c
;
}
/** @overload */
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_round
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
int
,
n
*
2
>
c
;
...
...
@@ -649,6 +1368,7 @@ template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
return
c
;
}
/** @overload */
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_floor
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
...
...
@@ -660,6 +1380,7 @@ template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
return
c
;
}
/** @overload */
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_ceil
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
...
...
@@ -671,6 +1392,7 @@ template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
return
c
;
}
/** @overload */
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_trunc
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
...
...
@@ -682,6 +1404,9 @@ template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
return
c
;
}
/** @brief Convert to float
Supported input type is cv::v_int32x4. */
template
<
int
n
>
inline
v_reg
<
float
,
n
>
v_cvt_f32
(
const
v_reg
<
int
,
n
>&
a
)
{
v_reg
<
float
,
n
>
c
;
...
...
@@ -690,6 +1415,9 @@ template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
return
c
;
}
/** @brief Convert to double
Supported input type is cv::v_int32x4. */
template
<
int
n
>
inline
v_reg
<
double
,
n
>
v_cvt_f64
(
const
v_reg
<
int
,
n
*
2
>&
a
)
{
v_reg
<
double
,
n
>
c
;
...
...
@@ -698,6 +1426,9 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
return
c
;
}
/** @brief Convert to double
Supported input type is cv::v_float32x4. */
template
<
int
n
>
inline
v_reg
<
double
,
n
>
v_cvt_f64
(
const
v_reg
<
float
,
n
*
2
>&
a
)
{
v_reg
<
double
,
n
>
c
;
...
...
@@ -706,6 +1437,21 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
return
c
;
}
/** @brief Transpose 4x4 matrix
Scheme:
@code
a0 {A1 A2 A3 A4}
a1 {B1 B2 B3 B4}
a2 {C1 C2 C3 C4}
a3 {D1 D2 D3 D4}
===============
b0 {A1 B1 C1 D1}
b1 {A2 B2 C2 D2}
b2 {A3 B3 C3 D3}
b3 {A4 B4 C4 D4}
@endcode
*/
template
<
typename
_Tp
>
inline
void
v_transpose4x4
(
v_reg
<
_Tp
,
4
>&
a0
,
const
v_reg
<
_Tp
,
4
>&
a1
,
const
v_reg
<
_Tp
,
4
>&
a2
,
const
v_reg
<
_Tp
,
4
>&
a3
,
...
...
@@ -718,41 +1464,105 @@ inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
b3
=
v_reg
<
_Tp
,
4
>
(
a0
.
s
[
3
],
a1
.
s
[
3
],
a2
.
s
[
3
],
a3
.
s
[
3
]);
}
typedef
v_reg
<
uchar
,
16
>
v_uint8x16
;
typedef
v_reg
<
schar
,
16
>
v_int8x16
;
typedef
v_reg
<
ushort
,
8
>
v_uint16x8
;
typedef
v_reg
<
short
,
8
>
v_int16x8
;
typedef
v_reg
<
unsigned
,
4
>
v_uint32x4
;
typedef
v_reg
<
int
,
4
>
v_int32x4
;
typedef
v_reg
<
float
,
4
>
v_float32x4
;
typedef
v_reg
<
float
,
8
>
v_float32x8
;
typedef
v_reg
<
double
,
2
>
v_float64x2
;
typedef
v_reg
<
uint64
,
2
>
v_uint64x2
;
typedef
v_reg
<
int64
,
2
>
v_int64x2
;
#define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \
inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
//! @name Init with zero
//! @{
//! @brief Create new vector with zero elements
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_uint8x16
,
uchar
,
u8
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_int8x16
,
schar
,
s8
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_uint16x8
,
ushort
,
u16
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_int16x8
,
short
,
s16
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_uint32x4
,
unsigned
,
u32
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_int32x4
,
int
,
s32
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_float32x4
,
float
,
f32
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_float64x2
,
double
,
f64
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_uint64x2
,
uint64
,
u64
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_int64x2
,
int64
,
s64
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
//! @name Init with value
//! @{
//! @brief Create new vector with elements set to a specific value
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_uint8x16
,
uchar
,
u8
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_int8x16
,
schar
,
s8
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_uint16x8
,
ushort
,
u16
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_int16x8
,
short
,
s16
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_uint32x4
,
unsigned
,
u32
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_int32x4
,
int
,
s32
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_float32x4
,
float
,
f32
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_float64x2
,
double
,
f64
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_uint64x2
,
uint64
,
u64
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_int64x2
,
int64
,
s64
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
template<typename _Tp0, int n0> inline _Tpvec \
v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(a); }
OPENCV_HAL_IMPL_C_INIT
(
v_uint8x16
,
uchar
,
u8
)
OPENCV_HAL_IMPL_C_INIT
(
v_int8x16
,
schar
,
s8
)
OPENCV_HAL_IMPL_C_INIT
(
v_uint16x8
,
ushort
,
u16
)
OPENCV_HAL_IMPL_C_INIT
(
v_int16x8
,
short
,
s16
)
OPENCV_HAL_IMPL_C_INIT
(
v_uint32x4
,
unsigned
,
u32
)
OPENCV_HAL_IMPL_C_INIT
(
v_int32x4
,
int
,
s32
)
OPENCV_HAL_IMPL_C_INIT
(
v_float32x4
,
float
,
f32
)
OPENCV_HAL_IMPL_C_INIT
(
v_float64x2
,
double
,
f64
)
OPENCV_HAL_IMPL_C_INIT
(
v_uint64x2
,
uint64
,
u64
)
OPENCV_HAL_IMPL_C_INIT
(
v_uint64x2
,
int64
,
s64
)
#define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \
{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
//! @name Reinterpret
//! @{
//! @brief Convert vector to different type without modifying underlying data.
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_uint8x16
,
uchar
,
u8
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_int8x16
,
schar
,
s8
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_uint16x8
,
ushort
,
u16
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_int16x8
,
short
,
s16
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_uint32x4
,
unsigned
,
u32
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_int32x4
,
int
,
s32
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_float32x4
,
float
,
f32
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_float64x2
,
double
,
f64
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_uint64x2
,
uint64
,
u64
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_int64x2
,
int64
,
s64
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return a << n; } \
{ return a << n; }
//! @name Left shift
//! @{
//! @brief Shift left
OPENCV_HAL_IMPL_C_SHIFTL
(
v_uint16x8
,
ushort
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_int16x8
,
short
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_uint32x4
,
unsigned
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_int32x4
,
int
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_uint64x2
,
uint64
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_int64x2
,
int64
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return a >> n; } \
{ return a >> n; }
//! @name Right shift
//! @{
//! @brief Shift right
OPENCV_HAL_IMPL_C_SHIFTR
(
v_uint16x8
,
ushort
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_int16x8
,
short
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_uint32x4
,
unsigned
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_int32x4
,
int
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_uint64x2
,
uint64
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_int64x2
,
int64
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ \
_Tpvec c; \
...
...
@@ -761,15 +1571,20 @@ template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
return c; \
}
OPENCV_HAL_IMPL_C_SHIFT
(
v_uint16x8
,
ushort
)
OPENCV_HAL_IMPL_C_SHIFT
(
v_int16x8
,
short
)
OPENCV_HAL_IMPL_C_SHIFT
(
v_uint32x4
,
unsigned
)
OPENCV_HAL_IMPL_C_SHIFT
(
v_int32x4
,
int
)
OPENCV_HAL_IMPL_C_SHIFT
(
v_uint64x2
,
uint64
)
OPENCV_HAL_IMPL_C_SHIFT
(
v_int64x2
,
int64
)
#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
//! @name Rounding shift
//! @{
//! @brief Rounding shift right
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_uint16x8
,
ushort
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_int16x8
,
short
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_uint32x4
,
unsigned
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_int32x4
,
int
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_uint64x2
,
uint64
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_int64x2
,
int64
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \
inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpnvec c; \
...
...
@@ -779,7 +1594,30 @@ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
} \
return c; \
} \
}
//! @name Pack
//! @{
//! @brief Pack values from two vectors to one
//!
//! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
//! converts to corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_PACK
(
v_uint16x8
,
v_uint8x16
,
uchar
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int16x8
,
v_int8x16
,
schar
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_uint32x4
,
v_uint16x8
,
ushort
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int32x4
,
v_int16x8
,
short
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_uint64x2
,
v_uint32x4
,
unsigned
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int64x2
,
v_int32x4
,
int
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int16x8
,
v_uint8x16
,
uchar
,
pack_u
)
OPENCV_HAL_IMPL_C_PACK
(
v_int32x4
,
v_uint16x8
,
ushort
,
pack_u
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpnvec c; \
...
...
@@ -789,27 +1627,98 @@ template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpve
c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
} \
return c; \
} \
}
//! @name Pack with rounding shift
//! @{
//! @brief Pack values from two vectors to one with rounding shift
//!
//! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
//! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_uint16x8
,
ushort
,
v_uint8x16
,
uchar
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int16x8
,
short
,
v_int8x16
,
schar
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_uint32x4
,
unsigned
,
v_uint16x8
,
ushort
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int32x4
,
int
,
v_int16x8
,
short
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_uint64x2
,
uint64
,
v_uint32x4
,
unsigned
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int64x2
,
int64
,
v_int32x4
,
int
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int16x8
,
short
,
v_uint8x16
,
uchar
,
pack_u
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int32x4
,
int
,
v_uint16x8
,
ushort
,
pack_u
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
} \
}
//! @name Pack and store
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be stored into memory with saturating conversion to narrower type.
//! Variant with _u_ suffix converts to corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_uint16x8
,
ushort
,
v_uint8x16
,
uchar
,
pack
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int16x8
,
short
,
v_int8x16
,
schar
,
pack
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_uint32x4
,
unsigned
,
v_uint16x8
,
ushort
,
pack
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int32x4
,
int
,
v_int16x8
,
short
,
pack
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_uint64x2
,
uint64
,
v_uint32x4
,
unsigned
,
pack
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int64x2
,
int64
,
v_int32x4
,
int
,
pack
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int16x8
,
short
,
v_uint8x16
,
uchar
,
pack_u
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int32x4
,
int
,
v_uint16x8
,
ushort
,
pack_u
)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
}
OPENCV_HAL_IMPL_C_PACK
(
v_uint16x8
,
ushort
,
v_uint8x16
,
uchar
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int16x8
,
short
,
v_int8x16
,
schar
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int16x8
,
short
,
v_uint8x16
,
uchar
,
pack_u
)
OPENCV_HAL_IMPL_C_PACK
(
v_uint32x4
,
unsigned
,
v_uint16x8
,
ushort
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int32x4
,
int
,
v_int16x8
,
short
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int32x4
,
int
,
v_uint16x8
,
ushort
,
pack_u
)
OPENCV_HAL_IMPL_C_PACK
(
v_uint64x2
,
uint64
,
v_uint32x4
,
unsigned
,
pack
)
OPENCV_HAL_IMPL_C_PACK
(
v_int64x2
,
int64
,
v_int32x4
,
int
,
pack
)
//! @name Pack and store with rounding shift
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
//! memory. Variant with _u_ suffix converts to unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_uint16x8
,
ushort
,
v_uint8x16
,
uchar
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int16x8
,
short
,
v_int8x16
,
schar
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_uint32x4
,
unsigned
,
v_uint16x8
,
ushort
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int32x4
,
int
,
v_int16x8
,
short
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_uint64x2
,
uint64
,
v_uint32x4
,
unsigned
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int64x2
,
int64
,
v_int32x4
,
int
,
pack
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int16x8
,
short
,
v_uint8x16
,
uchar
,
pack_u
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int32x4
,
int
,
v_uint16x8
,
ushort
,
pack_u
)
//! @}
/** @brief Matrix multiplication
Scheme:
@code
{A0 A1 A2 A3} |V0|
{B0 B1 B2 B3} |V1|
{C0 C1 C2 C3} |V2|
{D0 D1 D2 D3} x |V3|
====================
{R0 R1 R2 R3}, where:
R0 = A0V0 + A1V1 + A2V2 + A3V3,
R1 = B0V0 + B1V1 + B2V2 + B3V3
...
@endcode
*/
inline
v_float32x4
v_matmul
(
const
v_float32x4
&
v
,
const
v_float32x4
&
m0
,
const
v_float32x4
&
m1
,
const
v_float32x4
&
m2
,
const
v_float32x4
&
m3
)
...
...
@@ -820,6 +1729,8 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
v
.
s
[
0
]
*
m0
.
s
[
3
]
+
v
.
s
[
1
]
*
m1
.
s
[
3
]
+
v
.
s
[
2
]
*
m2
.
s
[
3
]
+
v
.
s
[
3
]
*
m3
.
s
[
3
]);
}
//! @}
}
#endif
modules/hal/include/opencv2/hal/intrin_neon.hpp
View file @
a275489f
...
...
@@ -48,6 +48,8 @@
namespace
cv
{
//! @cond IGNORED
#define CV_SIMD128 1
struct
v_uint8x16
...
...
@@ -278,14 +280,15 @@ void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
}
OPENCV_HAL_IMPL_NEON_PACK
(
v_uint8x16
,
uchar
,
uint8x8_t
,
u8
,
v_uint16x8
,
u16
,
pack
,
n
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_uint8x16
,
uchar
,
uint8x8_t
,
u8
,
v_int16x8
,
s16
,
pack_u
,
un
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_int8x16
,
schar
,
int8x8_t
,
s8
,
v_int16x8
,
s16
,
pack
,
n
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_uint16x8
,
ushort
,
uint16x4_t
,
u16
,
v_uint32x4
,
u32
,
pack
,
n
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_uint16x8
,
ushort
,
uint16x4_t
,
u16
,
v_int32x4
,
s32
,
pack_u
,
un
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_int16x8
,
short
,
int16x4_t
,
s16
,
v_int32x4
,
s32
,
pack
,
n
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_uint32x4
,
unsigned
,
uint32x2_t
,
u32
,
v_uint64x2
,
u64
,
pack
,
n
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_int32x4
,
int
,
int32x2_t
,
s32
,
v_int64x2
,
s64
,
pack
,
n
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_uint8x16
,
uchar
,
uint8x8_t
,
u8
,
v_int16x8
,
s16
,
pack_u
,
un
)
OPENCV_HAL_IMPL_NEON_PACK
(
v_uint16x8
,
ushort
,
uint16x4_t
,
u16
,
v_int32x4
,
s32
,
pack_u
,
un
)
inline
v_float32x4
v_matmul
(
const
v_float32x4
&
v
,
const
v_float32x4
&
m0
,
const
v_float32x4
&
m1
,
const
v_float32x4
&
m2
,
const
v_float32x4
&
m3
)
...
...
@@ -374,7 +377,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t
c
=
vmull_s16
(
vget_low_s16
(
a
.
val
),
vget_low_s16
(
b
.
val
));
int32x4_t
d
=
vmull_s16
(
vget_high_s16
(
a
.
val
),
vget_high_s16
(
b
.
val
));
int32x4x2_t
cd
=
v
trn
q_s32
(
c
,
d
);
int32x4x2_t
cd
=
v
uzp
q_s32
(
c
,
d
);
return
v_int32x4
(
vaddq_s32
(
cd
.
val
[
0
],
cd
.
val
[
1
]));
}
...
...
@@ -497,6 +500,16 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_uint32x4
,
v_absdiff
,
vabdq_u32
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_float32x4
,
v_absdiff
,
vabdq_f32
)
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec2(cast(intrin(a.val, b.val))); \
}
OPENCV_HAL_IMPL_NEON_BIN_FUNC2
(
v_int8x16
,
v_uint8x16
,
vreinterpretq_u8_s8
,
v_absdiff
,
vabdq_s8
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2
(
v_int16x8
,
v_uint16x8
,
vreinterpretq_u16_s16
,
v_absdiff
,
vabdq_s16
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2
(
v_int32x4
,
v_uint32x4
,
vreinterpretq_u32_s32
,
v_absdiff
,
vabdq_s32
)
inline
v_float32x4
v_magnitude
(
const
v_float32x4
&
a
,
const
v_float32x4
&
b
)
{
v_float32x4
x
(
vmlaq_f32
(
vmulq_f32
(
a
.
val
,
a
.
val
),
b
.
val
,
b
.
val
));
...
...
@@ -641,13 +654,13 @@ inline bool v_check_all(const v_float32x4& a)
{
return
v_check_all
(
v_reinterpret_as_u32
(
a
));
}
inline
bool
v_check_any
(
const
v_int8x16
&
a
)
{
return
v_check_a
ll
(
v_reinterpret_as_u8
(
a
));
}
{
return
v_check_a
ny
(
v_reinterpret_as_u8
(
a
));
}
inline
bool
v_check_any
(
const
v_int16x8
&
a
)
{
return
v_check_a
ll
(
v_reinterpret_as_u16
(
a
));
}
{
return
v_check_a
ny
(
v_reinterpret_as_u16
(
a
));
}
inline
bool
v_check_any
(
const
v_int32x4
&
a
)
{
return
v_check_a
ll
(
v_reinterpret_as_u32
(
a
));
}
{
return
v_check_a
ny
(
v_reinterpret_as_u32
(
a
));
}
inline
bool
v_check_any
(
const
v_float32x4
&
a
)
{
return
v_check_a
ll
(
v_reinterpret_as_u32
(
a
));
}
{
return
v_check_a
ny
(
v_reinterpret_as_u32
(
a
));
}
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
...
...
@@ -678,6 +691,8 @@ OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND
(
v_int8x16
,
v_int16x8
,
schar
,
s8
)
OPENCV_HAL_IMPL_NEON_EXPAND
(
v_uint16x8
,
v_uint32x4
,
ushort
,
u16
)
OPENCV_HAL_IMPL_NEON_EXPAND
(
v_int16x8
,
v_int32x4
,
short
,
s16
)
OPENCV_HAL_IMPL_NEON_EXPAND
(
v_uint32x4
,
v_uint64x2
,
uint
,
u32
)
OPENCV_HAL_IMPL_NEON_EXPAND
(
v_int32x4
,
v_int64x2
,
int
,
s32
)
inline
v_uint32x4
v_load_expand_q
(
const
uchar
*
ptr
)
{
...
...
@@ -840,6 +855,8 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
return
v_float32x4
(
vcvtq_f32_s32
(
a
.
val
));
}
//! @endcond
}
#endif
modules/hal/include/opencv2/hal/intrin_sse.hpp
View file @
a275489f
...
...
@@ -51,6 +51,8 @@
namespace
cv
{
//! @cond IGNORED
struct
v_uint8x16
{
typedef
uchar
lane_type
;
...
...
@@ -296,6 +298,11 @@ OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT
(
v_uint64x2
,
u64
)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT
(
v_int64x2
,
s64
)
inline
v_float32x4
v_reinterpret_as_f32
(
const
v_float32x4
&
a
)
{
return
a
;
}
inline
v_float64x2
v_reinterpret_as_f64
(
const
v_float64x2
&
a
)
{
return
a
;
}
inline
v_float32x4
v_reinterpret_as_f32
(
const
v_float64x2
&
a
)
{
return
v_float32x4
(
_mm_castpd_ps
(
a
.
val
));
}
inline
v_float64x2
v_reinterpret_as_f64
(
const
v_float32x4
&
a
)
{
return
v_float64x2
(
_mm_castps_pd
(
a
.
val
));
}
//////////////// PACK ///////////////
inline
v_uint8x16
v_pack
(
const
v_uint16x8
&
a
,
const
v_uint16x8
&
b
)
{
...
...
@@ -430,6 +437,17 @@ inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
_mm_storel_epi64
((
__m128i
*
)
ptr
,
r
);
}
template
<
int
n
>
inline
v_uint16x8
v_rshr_pack_u
(
const
v_int32x4
&
a
,
const
v_int32x4
&
b
)
{
__m128i
delta
=
_mm_set1_epi32
(
1
<<
(
n
-
1
)),
delta32
=
_mm_set1_epi32
(
32768
);
__m128i
a1
=
_mm_sub_epi32
(
_mm_srai_epi32
(
_mm_add_epi32
(
a
.
val
,
delta
),
n
),
delta32
);
__m128i
a2
=
_mm_sub_epi16
(
_mm_packs_epi32
(
a1
,
a1
),
_mm_set1_epi16
(
-
32768
));
__m128i
b1
=
_mm_sub_epi32
(
_mm_srai_epi32
(
_mm_add_epi32
(
b
.
val
,
delta
),
n
),
delta32
);
__m128i
b2
=
_mm_sub_epi16
(
_mm_packs_epi32
(
b1
,
b1
),
_mm_set1_epi16
(
-
32768
));
return
v_uint16x8
(
_mm_unpacklo_epi64
(
a2
,
b2
));
}
template
<
int
n
>
inline
void
v_rshr_pack_u_store
(
ushort
*
ptr
,
const
v_int32x4
&
a
)
{
...
...
@@ -460,7 +478,7 @@ void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
__m128i
delta
=
_mm_set1_epi32
(
1
<<
(
n
-
1
));
__m128i
a1
=
_mm_srai_epi32
(
_mm_add_epi32
(
a
.
val
,
delta
),
n
);
_mm_storel_epi64
((
__m128i
*
)
ptr
,
a1
);
_mm_storel_epi64
((
__m128i
*
)
ptr
,
_mm_packs_epi32
(
a1
,
a1
)
);
}
...
...
@@ -469,7 +487,7 @@ inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
__m128i
v0
=
_mm_unpacklo_epi32
(
a
.
val
,
b
.
val
);
// a0 a1 0 0
__m128i
v1
=
_mm_unpackhi_epi32
(
a
.
val
,
b
.
val
);
// b0 b1 0 0
return
v_uint32x4
(
_mm_unpacklo_epi
64
(
v0
,
v1
));
return
v_uint32x4
(
_mm_unpacklo_epi
32
(
v0
,
v1
));
}
inline
void
v_pack_store
(
unsigned
*
ptr
,
const
v_uint64x2
&
a
)
...
...
@@ -483,7 +501,7 @@ inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
__m128i
v0
=
_mm_unpacklo_epi32
(
a
.
val
,
b
.
val
);
// a0 a1 0 0
__m128i
v1
=
_mm_unpackhi_epi32
(
a
.
val
,
b
.
val
);
// b0 b1 0 0
return
v_int32x4
(
_mm_unpacklo_epi
64
(
v0
,
v1
));
return
v_int32x4
(
_mm_unpacklo_epi
32
(
v0
,
v1
));
}
inline
void
v_pack_store
(
int
*
ptr
,
const
v_int64x2
&
a
)
...
...
@@ -501,7 +519,7 @@ v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
__m128i
b1
=
_mm_srli_epi64
(
_mm_add_epi64
(
b
.
val
,
delta2
.
val
),
n
);
__m128i
v0
=
_mm_unpacklo_epi32
(
a1
,
b1
);
// a0 a1 0 0
__m128i
v1
=
_mm_unpackhi_epi32
(
a1
,
b1
);
// b0 b1 0 0
return
v_uint32x4
(
_mm_unpacklo_epi
64
(
v0
,
v1
));
return
v_uint32x4
(
_mm_unpacklo_epi
32
(
v0
,
v1
));
}
template
<
int
n
>
inline
...
...
@@ -534,7 +552,7 @@ v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
__m128i
b1
=
v_srai_epi64
(
_mm_add_epi64
(
b
.
val
,
delta2
.
val
),
n
);
__m128i
v0
=
_mm_unpacklo_epi32
(
a1
,
b1
);
// a0 a1 0 0
__m128i
v1
=
_mm_unpackhi_epi32
(
a1
,
b1
);
// b0 b1 0 0
return
v_int32x4
(
_mm_unpacklo_epi
64
(
v0
,
v1
));
return
v_int32x4
(
_mm_unpacklo_epi
32
(
v0
,
v1
));
}
template
<
int
n
>
inline
...
...
@@ -630,8 +648,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
{
__m128i
v0
=
_mm_mullo_epi16
(
a
.
val
,
b
.
val
);
__m128i
v1
=
_mm_mulhi_epi16
(
a
.
val
,
b
.
val
);
c
.
val
=
_mm_unpacklo_epi
32
(
v0
,
v1
);
d
.
val
=
_mm_unpackhi_epi
32
(
v0
,
v1
);
c
.
val
=
_mm_unpacklo_epi
16
(
v0
,
v1
);
d
.
val
=
_mm_unpackhi_epi
16
(
v0
,
v1
);
}
inline
void
v_mul_expand
(
const
v_uint16x8
&
a
,
const
v_uint16x8
&
b
,
...
...
@@ -639,8 +657,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
{
__m128i
v0
=
_mm_mullo_epi16
(
a
.
val
,
b
.
val
);
__m128i
v1
=
_mm_mulhi_epu16
(
a
.
val
,
b
.
val
);
c
.
val
=
_mm_unpacklo_epi
32
(
v0
,
v1
);
d
.
val
=
_mm_unpackhi_epi
32
(
v0
,
v1
);
c
.
val
=
_mm_unpacklo_epi
16
(
v0
,
v1
);
d
.
val
=
_mm_unpackhi_epi
16
(
v0
,
v1
);
}
inline
void
v_mul_expand
(
const
v_uint32x4
&
a
,
const
v_uint32x4
&
b
,
...
...
@@ -869,6 +887,18 @@ inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
// 8- and 16-bit absolute difference, generated by macro. The constant is a
// per-lane sign-bit mask — presumably used to re-bias signed lanes into the
// unsigned range before subtracting; confirm against the macro definition
// (not visible in this chunk).
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)

// |a - b| for unsigned 32-bit lanes: max - min can never underflow.
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{
    return v_max(a, b) - v_min(a, b);
}

// |a - b| for signed 32-bit lanes, result returned as unsigned.
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    __m128i d = _mm_sub_epi32(a.val, b.val);
    // m = all-ones in lanes where b > a, i.e. where d is negative
    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
    // (d ^ m) - m negates d exactly in the lanes selected by m
    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
...
...
@@ -1047,8 +1077,8 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT
(
v_int16x8
,
si128
)
OPENCV_HAL_IMPL_SSE_SELECT
(
v_uint32x4
,
si128
)
OPENCV_HAL_IMPL_SSE_SELECT
(
v_int32x4
,
si128
)
OPENCV_HAL_IMPL_SSE_SELECT
(
v_uint64x2
,
si128
)
OPENCV_HAL_IMPL_SSE_SELECT
(
v_int64x2
,
si128
)
//
OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
//
OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT
(
v_float32x4
,
ps
)
OPENCV_HAL_IMPL_SSE_SELECT
(
v_float64x2
,
pd
)
...
...
@@ -1257,7 +1287,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
__m128i
v0
=
_mm_unpacklo_epi8
(
u0
,
u2
);
// a0 a8 b0 b8 ...
__m128i
v1
=
_mm_unpackhi_epi8
(
u0
,
u2
);
// a2 a10 b2 b10 ...
__m128i
v2
=
_mm_unpacklo_epi8
(
u1
,
u3
);
// a4 a12 b4 b12 ...
__m128i
v3
=
_mm_unpackhi_epi8
(
u1
,
u3
);
// a6 a14 b
4
b14 ...
__m128i
v3
=
_mm_unpackhi_epi8
(
u1
,
u3
);
// a6 a14 b
6
b14 ...
u0
=
_mm_unpacklo_epi8
(
v0
,
v2
);
// a0 a4 a8 a12 ...
u1
=
_mm_unpacklo_epi8
(
v1
,
v3
);
// a2 a6 a10 a14 ...
...
...
@@ -1266,13 +1296,13 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
v0
=
_mm_unpacklo_epi8
(
u0
,
u1
);
// a0 a2 a4 a6 ...
v1
=
_mm_unpacklo_epi8
(
u2
,
u3
);
// a1 a3 a5 a7 ...
v2
=
_mm_unpackhi_epi8
(
u0
,
u1
);
//
b0 b2 b4 b
6 ...
v3
=
_mm_unpackhi_epi8
(
u2
,
u3
);
//
b1 b3 b5 b
7 ...
v2
=
_mm_unpackhi_epi8
(
u0
,
u1
);
//
c0 c2 c4 c
6 ...
v3
=
_mm_unpackhi_epi8
(
u2
,
u3
);
//
c1 c3 c5 c
7 ...
a
.
val
=
_mm_unpacklo_epi8
(
v0
,
v1
);
b
.
val
=
_mm_unpack
lo_epi8
(
v2
,
v3
);
c
.
val
=
_mm_unpack
hi_epi8
(
v0
,
v1
);
d
.
val
=
_mm_unpack
lo
_epi8
(
v2
,
v3
);
b
.
val
=
_mm_unpack
hi_epi8
(
v0
,
v1
);
c
.
val
=
_mm_unpack
lo_epi8
(
v2
,
v3
);
d
.
val
=
_mm_unpack
hi
_epi8
(
v2
,
v3
);
}
inline
void
v_load_deinterleave
(
const
ushort
*
ptr
,
v_uint16x8
&
a
,
v_uint16x8
&
b
,
v_uint16x8
&
c
)
...
...
@@ -1560,6 +1590,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
return
v_float64x2
(
_mm_cvtps_pd
(
a
.
val
));
}
//! @endcond
}
#endif
modules/hal/test/test_intrin.cpp
0 → 100644
View file @
a275489f
#include "test_intrin_utils.hpp"
#include <climits>
using
namespace
cv
;
// Test driver for one universal-intrinsic register type R (e.g. v_uint8x16).
// Each test_* method exercises one group of intrinsics and returns *this so
// the TEST() bodies can chain calls. LaneType is R's scalar element type.
template<typename R> struct TheTest
{
    typedef typename R::lane_type LaneType;

    // Load/store, aligned/unaligned variants, setzero/setall and reinterpret casts.
    TheTest & test_loadstore()
    {
        AlignedData<R> data;
        AlignedData<R> out;

        // check if addresses are aligned and unaligned respectively
        EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
        EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
        EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
        EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);

        // check some initialization methods
        R r1 = data.a;
        R r2 = v_load(data.u.d);
        R r3 = v_load_aligned(data.a.d);
        R r4(r2);
        EXPECT_EQ(data.a[0], r1.get0());
        EXPECT_EQ(data.u[0], r2.get0());
        EXPECT_EQ(data.a[0], r3.get0());
        EXPECT_EQ(data.u[0], r4.get0());

        // check some store methods
        out.u.clear();
        out.a.clear();
        v_store(out.u.d, r1);
        v_store_aligned(out.a.d, r2);
        EXPECT_EQ(data.a, out.a);
        EXPECT_EQ(data.u, out.u);

        // check more store methods
        Data<R> d, res(0);
        R r5 = d;
        v_store_high(res.mid(), r5);
        v_store_low(res.d, r5);
        EXPECT_EQ(d, res);

        // check halves load correctness
        res.clear();
        R r6 = v_load_halves(d.d, d.mid());
        v_store(res.d, r6);
        EXPECT_EQ(d, res);

        // zero, all
        Data<R> resZ = RegTrait<R>::zero();
        Data<R> resV = RegTrait<R>::all(8);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ((LaneType)0, resZ[i]);
            EXPECT_EQ((LaneType)8, resV[i]);
        }

        // reinterpret_as: each cast must preserve the raw bytes round-trip
        v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
        v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
        v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
        v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
        v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
        v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
        v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
        v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
        v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
#if CV_SIMD128_64F
        v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
#endif

        return *this;
    }

    // Round-trip of 3- and 4-channel interleaved store followed by deinterleaved load.
    TheTest & test_interleave()
    {
        Data<R> data1, data2, data3, data4;
        data2 += 20;
        data3 += 40;
        data4 += 60;

        R a = data1, b = data2, c = data3;
        R d = data1, e = data2, f = data3, g = data4;

        LaneType buf3[R::nlanes * 3];
        LaneType buf4[R::nlanes * 4];

        v_store_interleave(buf3, a, b, c);
        v_store_interleave(buf4, d, e, f, g);

        Data<R> z(0);
        a = b = c = d = e = f = g = z;

        v_load_deinterleave(buf3, a, b, c);
        v_load_deinterleave(buf4, d, e, f, g);

        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(data1, Data<R>(a));
            EXPECT_EQ(data2, Data<R>(b));
            EXPECT_EQ(data3, Data<R>(c));

            EXPECT_EQ(data1, Data<R>(d));
            EXPECT_EQ(data2, Data<R>(e));
            EXPECT_EQ(data3, Data<R>(f));
            EXPECT_EQ(data4, Data<R>(g));
        }

        return *this;
    }

    // v_expand and v_load_expand: widen each lane to the double-width type.
    TheTest & test_expand()
    {
        typedef typename RegTrait<R>::w_reg Rx2;
        Data<R> dataA;
        R a = dataA;

        Data<Rx2> resB = v_load_expand(dataA.d);

        Rx2 c, d;
        v_expand(a, c, d);

        Data<Rx2> resC = c, resD = d;
        const int n = Rx2::nlanes;
        for (int i = 0; i < n; ++i)
        {
            EXPECT_EQ(dataA[i], resB[i]);
            EXPECT_EQ(dataA[i], resC[i]);
            EXPECT_EQ(dataA[i + n], resD[i]);
        }

        return *this;
    }

    // v_load_expand_q: widen each lane to the quadruple-width type.
    TheTest & test_expand_q()
    {
        typedef typename RegTrait<R>::q_reg Rx4;
        Data<R> data;
        Data<Rx4> out = v_load_expand_q(data.d);
        const int n = Rx4::nlanes;
        for (int i = 0; i < n; ++i)
            EXPECT_EQ(data[i], out[i]);

        return *this;
    }

    // Saturating add/subtract operators against scalar saturate_cast reference.
    TheTest & test_addsub()
    {
        Data<R> dataA, dataB;
        dataB.reverse();
        R a = dataA, b = dataB;

        Data<R> resC = a + b, resD = a - b;
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] - dataB[i]), resD[i]);
        }

        return *this;
    }

    // Wrapping (modular) add/subtract variants.
    TheTest & test_addsub_wrap()
    {
        Data<R> dataA, dataB;
        dataB.reverse();
        R a = dataA, b = dataB;

        Data<R> resC = v_add_wrap(a, b), resD = v_sub_wrap(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
            EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
        }
        return *this;
    }

    // Per-lane multiplication.
    TheTest & test_mul()
    {
        Data<R> dataA, dataB;
        dataB.reverse();
        R a = dataA, b = dataB;

        Data<R> resC = a * b;
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
        }

        return *this;
    }

    // Per-lane division (floating-point types only in practice).
    TheTest & test_div()
    {
        Data<R> dataA, dataB;
        dataB.reverse();
        R a = dataA, b = dataB;

        Data<R> resC = a / b;
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(dataA[i] / dataB[i], resC[i]);
        }

        return *this;
    }

    // v_mul_expand: full-precision multiply into the double-width type.
    TheTest & test_mul_expand()
    {
        typedef typename RegTrait<R>::w_reg Rx2;
        Data<R> dataA, dataB(2);
        R a = dataA, b = dataB;
        Rx2 c, d;

        v_mul_expand(a, b, c, d);

        Data<Rx2> resC = c, resD = d;
        const int n = R::nlanes / 2;
        for (int i = 0; i < n; ++i)
        {
            EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]);
            EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
        }

        return *this;
    }

    // Shift operators and the v_shl/v_shr template forms, by compile-time s.
    template<int s>
    TheTest & test_shift()
    {
        Data<R> dataA;
        R a = dataA;

        Data<R> resB = a << s, resC = v_shl<s>(a), resD = a >> s, resE = v_shr<s>(a);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(dataA[i] << s, resB[i]);
            EXPECT_EQ(dataA[i] << s, resC[i]);
            EXPECT_EQ(dataA[i] >> s, resD[i]);
            EXPECT_EQ(dataA[i] >> s, resE[i]);
        }
        return *this;
    }

    // All six comparison operators; lanes are all-ones (nonzero) where true.
    TheTest & test_cmp()
    {
        Data<R> dataA, dataB;
        dataB.reverse();
        dataB += 1;
        R a = dataA, b = dataB;

        Data<R> resC = (a == b);
        Data<R> resD = (a != b);
        Data<R> resE = (a > b);
        Data<R> resF = (a >= b);
        Data<R> resG = (a < b);
        Data<R> resH = (a <= b);

        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
            EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
            EXPECT_EQ(dataA[i] >  dataB[i], resE[i] != 0);
            EXPECT_EQ(dataA[i] >= dataB[i], resF[i] != 0);
            EXPECT_EQ(dataA[i] <  dataB[i], resG[i] != 0);
            EXPECT_EQ(dataA[i] <= dataB[i], resH[i] != 0);
        }
        return *this;
    }

    // v_dotprod: pairwise multiply-accumulate into the double-width type.
    TheTest & test_dot_prod()
    {
        typedef typename RegTrait<R>::w_reg Rx2;
        Data<R> dataA, dataB(2);
        R a = dataA, b = dataB;

        Data<Rx2> res = v_dotprod(a, b);

        const int n = R::nlanes / 2;
        for (int i = 0; i < n; ++i)
        {
            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]);
        }
        return *this;
    }

    // Bitwise and/or/xor/not operators.
    TheTest & test_logic()
    {
        Data<R> dataA, dataB(2);
        R a = dataA, b = dataB;

        Data<R> resC = a & b, resD = a | b, resE = a ^ b, resF = ~a;
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(dataA[i] & dataB[i], resC[i]);
            EXPECT_EQ(dataA[i] | dataB[i], resD[i]);
            EXPECT_EQ(dataA[i] ^ dataB[i], resE[i]);
            EXPECT_EQ((LaneType)~dataA[i], resF[i]);
        }
        return *this;
    }

    // v_sqrt, v_invsqrt and v_abs against scalar references.
    TheTest & test_sqrt_abs()
    {
        Data<R> dataA, dataD;
        dataD *= -1.0;                 // dataD holds negated values for v_abs
        R a = dataA, d = dataD;

        Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_FLOAT_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
            EXPECT_FLOAT_EQ((float)1/std::sqrt(dataA[i]), (float)resC[i]);
            EXPECT_FLOAT_EQ((float)abs(dataA[i]), (float)resE[i]);
        }

        return *this;
    }

    // Per-lane v_min / v_max.
    TheTest & test_min_max()
    {
        Data<R> dataA, dataB;
        dataB.reverse();
        R a = dataA, b = dataB;

        Data<R> resC = v_min(a, b), resD = v_max(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]);
            EXPECT_EQ(std::max(dataA[i], dataB[i]), resD[i]);
        }

        return *this;
    }

    // Integer v_absdiff, checked on extreme values; result is the unsigned type.
    // Reference is computed in the unsigned domain by xor-ing the sign bit in.
    TheTest & test_absdiff()
    {
        typedef typename RegTrait<R>::u_reg Ru;
        typedef typename Ru::lane_type u_type;
        Data<R> dataA(std::numeric_limits<LaneType>::max()),
                dataB(std::numeric_limits<LaneType>::min());
        dataA[0] = (LaneType)-1;
        dataB[0] = 1;
        dataA[1] = 2;
        dataB[1] = (LaneType)-2;
        R a = dataA, b = dataB;
        Data<Ru> resC = v_absdiff(a, b);
        const u_type mask = std::numeric_limits<LaneType>::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0;
        for (int i = 0; i < Ru::nlanes; ++i)
        {
            u_type uA = dataA[i] ^ mask;
            u_type uB = dataB[i] ^ mask;
            EXPECT_EQ(uA > uB ? uA - uB : uB - uA, resC[i]);
        }
        return *this;
    }

    // Floating-point v_absdiff, checked on extreme values.
    TheTest & test_float_absdiff()
    {
        Data<R> dataA(std::numeric_limits<LaneType>::max()),
                dataB(std::numeric_limits<LaneType>::min());
        dataA[0] = -1;
        dataB[0] = 1;
        dataA[1] = 2;
        dataB[1] = -2;
        R a = dataA, b = dataB;
        Data<R> resC = v_absdiff(a, b);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(dataA[i] > dataB[i] ? dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]);
        }
        return *this;
    }

    // Horizontal reductions. Default Data is (1..nlanes), so min=1, max=nlanes,
    // and sum = nlanes*(nlanes+1)/2 = (1+nlanes)*2 for the 4-lane types tested.
    TheTest & test_reduce()
    {
        Data<R> dataA;
        R a = dataA;
        EXPECT_EQ((LaneType)1, v_reduce_min(a));
        EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a));
        EXPECT_EQ((LaneType)(1 + R::nlanes)*2, v_reduce_sum(a));
        return *this;
    }

    // v_signmask, v_check_all/any and v_select (bitwise per-lane blend).
    TheTest & test_mask()
    {
        Data<R> dataA, dataB, dataC, dataD(1), dataE(2);
        dataA[1] *= (LaneType)-1;      // only lane 1 negative -> signmask == 2
        dataC *= (LaneType)-1;         // all lanes negative -> check_all true
        R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;

        int m = v_signmask(a);
        EXPECT_EQ(2, m);

        EXPECT_EQ(false, v_check_all(a));
        EXPECT_EQ(false, v_check_all(b));
        EXPECT_EQ(true, v_check_all(c));

        EXPECT_EQ(true, v_check_any(a));
        EXPECT_EQ(false, v_check_any(b));
        EXPECT_EQ(true, v_check_any(c));

        typedef V_TypeTraits<LaneType> Traits;
        typedef typename Traits::int_type int_type;

        R f = v_select(b, d, e);
        Data<R> resF = f;
        for (int i = 0; i < R::nlanes; ++i)
        {
            int_type m2 = Traits::reinterpret_int(dataB[i]);
            // v_select is a bitwise blend: (d & mask) | (e & ~mask)
            EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2)
                    | (Traits::reinterpret_int(dataE[i]) & ~m2),
                      Traits::reinterpret_int(resF[i]));
        }

        return *this;
    }

    // v_pack / v_rshr_pack<s> / v_pack_store / v_rshr_pack_store<s> from the
    // double-width type, with saturation and rounding right-shift by s.
    template<int s>
    TheTest & test_pack()
    {
        typedef typename RegTrait<R>::w_reg Rx2;
        typedef typename Rx2::lane_type w_type;
        Data<Rx2> dataA, dataB;
        dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
        dataB *= 10;
        Rx2 a = dataA, b = dataB;

        Data<R> resC = v_pack(a, b);
        Data<R> resD = v_rshr_pack<s>(a, b);

        Data<R> resE(0);
        v_pack_store(resE.d, b);

        Data<R> resF(0);
        v_rshr_pack_store<s>(resF.d, b);

        const int n = Rx2::nlanes;
        const w_type add = (w_type)1 << (s - 1);   // rounding addend
        for (int i = 0; i < n; ++i)
        {
            EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
            EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
            EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
            EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
            EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
            EXPECT_EQ((LaneType)0, resE[i + n]);
            EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
            EXPECT_EQ((LaneType)0, resF[i + n]);
        }
        return *this;
    }

    // Unsigned-saturating pack family (v_pack_u etc.) from the signed
    // double-width type.
    template<int s>
    TheTest & test_pack_u()
    {
        typedef typename RegTrait<R>::w_reg Rx2;
        typedef typename RegTrait<Rx2>::int_reg Ri2;
        typedef typename Ri2::lane_type w_type;

        Data<Ri2> dataA, dataB;
        dataA += -10;
        dataB *= 10;
        Ri2 a = dataA, b = dataB;

        Data<R> resC = v_pack_u(a, b);
        Data<R> resD = v_rshr_pack_u<s>(a, b);

        Data<R> resE(0);
        v_pack_u_store(resE.d, b);

        Data<R> resF(0);
        v_rshr_pack_u_store<s>(resF.d, b);

        const int n = Ri2::nlanes;
        const w_type add = (w_type)1 << (s - 1);   // rounding addend
        for (int i = 0; i < n; ++i)
        {
            EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
            EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
            EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
            EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
            EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
            EXPECT_EQ((LaneType)0, resE[i + n]);
            EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
            EXPECT_EQ((LaneType)0, resF[i + n]);
        }
        return *this;
    }

    // v_zip, v_recombine, v_combine_low/high lane-shuffling primitives.
    TheTest & test_unpack()
    {
        Data<R> dataA, dataB;
        dataB *= 10;
        R a = dataA, b = dataB;

        R c, d, e, f, lo, hi;
        v_zip(a, b, c, d);
        v_recombine(a, b, e, f);
        lo = v_combine_low(a, b);
        hi = v_combine_high(a, b);

        Data<R> resC = c, resD = d, resE = e, resF = f, resLo = lo, resHi = hi;

        const int n = R::nlanes / 2;
        for (int i = 0; i < n; ++i)
        {
            EXPECT_EQ(dataA[i], resC[i*2]);
            EXPECT_EQ(dataB[i], resC[i*2 + 1]);
            EXPECT_EQ(dataA[i + n], resD[i*2]);
            EXPECT_EQ(dataB[i + n], resD[i*2 + 1]);

            EXPECT_EQ(dataA[i], resE[i]);
            EXPECT_EQ(dataB[i], resE[i + n]);
            EXPECT_EQ(dataA[i + n], resF[i]);
            EXPECT_EQ(dataB[i + n], resF[i + n]);

            EXPECT_EQ(dataA[i], resLo[i]);
            EXPECT_EQ(dataB[i], resLo[i + n]);
            EXPECT_EQ(dataA[i + n], resHi[i]);
            EXPECT_EQ(dataB[i + n], resHi[i + n]);
        }

        return *this;
    }

    // v_extract<s>: concatenated (a:b) vector shifted right by s lanes.
    template<int s>
    TheTest & test_extract()
    {
        Data<R> dataA, dataB;
        dataB *= 10;
        R a = dataA, b = dataB;

        Data<R> resC = v_extract<s>(a, b);

        for (int i = 0; i < R::nlanes; ++i)
        {
            if (i + s >= R::nlanes)
                EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]);
            else
                EXPECT_EQ(dataA[i + s], resC[i]);
        }

        return *this;
    }

    // Rounding/truncation to integer and magnitude/muladd float helpers.
    TheTest & test_float_math()
    {
        typedef typename RegTrait<R>::int_reg Ri;
        Data<R> data1, data2, data3;
        data1 *= 1.1;
        data2 += 10;
        R a1 = data1, a2 = data2, a3 = data3;

        Data<Ri> resB = v_round(a1),
                 resC = v_trunc(a1),
                 resD = v_floor(a1),
                 resE = v_ceil(a1);

        Data<R> resF = v_magnitude(a1, a2),
                resG = v_sqr_magnitude(a1, a2),
                resH = v_muladd(a1, a2, a3);

        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(cvRound(data1[i]), resB[i]);
            EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]);
            EXPECT_EQ(cvFloor(data1[i]), resD[i]);
            EXPECT_EQ(cvCeil(data1[i]), resE[i]);

            EXPECT_DOUBLE_EQ(std::sqrt(data1[i]*data1[i] + data2[i]*data2[i]), resF[i]);
            EXPECT_DOUBLE_EQ(data1[i]*data1[i] + data2[i]*data2[i], resG[i]);
            EXPECT_DOUBLE_EQ(data1[i]*data2[i] + data3[i], resH[i]);
        }

        return *this;
    }

    // Conversion to v_float32x4; only the overlapping lanes are compared.
    TheTest & test_float_cvt32()
    {
        typedef v_float32x4 Rt;
        Data<R> dataA;
        dataA *= 1.1;
        R a = dataA;
        Rt b = v_cvt_f32(a);
        Data<Rt> resB = b;
        int n = std::min<int>(Rt::nlanes, R::nlanes);
        for (int i = 0; i < n; ++i)
        {
            EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
        }
        return *this;
    }

    // Conversion to v_float64x2 (only when the backend supports 64-bit floats).
    TheTest & test_float_cvt64()
    {
#if CV_SIMD128_64F
        typedef v_float64x2 Rt;
        Data<R> dataA;
        dataA *= 1.1;
        R a = dataA;
        Rt b = v_cvt_f64(a);
        Data<Rt> resB = b;
        int n = std::min<int>(Rt::nlanes, R::nlanes);
        for (int i = 0; i < n; ++i)
        {
            EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
        }
#endif
        return *this;
    }

    // v_matmul: 4x4 matrix (columns a..d) times vector v.
    TheTest & test_matmul()
    {
        Data<R> dataV, dataA, dataB, dataC, dataD;
        dataB.reverse();
        dataC += 2;
        dataD *= 0.3;
        R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD;

        Data<R> res = v_matmul(v, a, b, c, d);
        for (int i = 0; i < R::nlanes; ++i)
        {
            LaneType val = dataV[0] * dataA[i]
                         + dataV[1] * dataB[i]
                         + dataV[2] * dataC[i]
                         + dataV[3] * dataD[i];
            EXPECT_DOUBLE_EQ(val, res[i]);
        }
        return *this;
    }

    // v_transpose4x4: res[i][j] must equal input column j, lane i.
    TheTest & test_transpose()
    {
        Data<R> dataA, dataB, dataC, dataD;
        dataB *= 5;
        dataC *= 10;
        dataD *= 15;
        R a = dataA, b = dataB, c = dataC, d = dataD;
        R e, f, g, h;
        v_transpose4x4(a, b, c, d,
                       e, f, g, h);

        Data<R> res[4] = {e, f, g, h};
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(dataA[i], res[i][0]);
            EXPECT_EQ(dataB[i], res[i][1]);
            EXPECT_EQ(dataC[i], res[i][2]);
            EXPECT_EQ(dataD[i], res[i][3]);
        }
        return *this;
    }

};
//============= 8-bit integer =====================================================================

// Exercises every universal-intrinsic operation available for 16-lane
// unsigned 8-bit vectors (pack_u only exists for the unsigned variant).
TEST(hal_intrin, uint8x16)
{
    TheTest<v_uint8x16>()
        .test_loadstore()
        .test_interleave()
        .test_expand()
        .test_expand_q()
        .test_addsub()
        .test_addsub_wrap()
        .test_cmp()
        .test_logic()
        .test_min_max()
        .test_absdiff()
        .test_mask()
        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
        .test_unpack()
        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
        ;
}

// Same suite for 16-lane signed 8-bit vectors.
TEST(hal_intrin, int8x16)
{
    TheTest<v_int8x16>()
        .test_loadstore()
        .test_interleave()
        .test_expand()
        .test_expand_q()
        .test_addsub()
        .test_addsub_wrap()
        .test_cmp()
        .test_logic()
        .test_min_max()
        .test_absdiff()
        .test_mask()
        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
        .test_unpack()
        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
        ;
}
//============= 16-bit integer =====================================================================

// Full suite for 8-lane unsigned 16-bit vectors (adds mul/shift over 8-bit).
TEST(hal_intrin, uint16x8)
{
    TheTest<v_uint16x8>()
        .test_loadstore()
        .test_interleave()
        .test_expand()
        .test_addsub()
        .test_addsub_wrap()
        .test_mul()
        .test_mul_expand()
        .test_cmp()
        .test_shift<1>()
        .test_shift<8>()
        .test_logic()
        .test_min_max()
        .test_absdiff()
        .test_mask()
        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
        .test_unpack()
        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
        ;
}

// Full suite for 8-lane signed 16-bit vectors (dot product is signed-only).
TEST(hal_intrin, int16x8)
{
    TheTest<v_int16x8>()
        .test_loadstore()
        .test_interleave()
        .test_expand()
        .test_addsub()
        .test_addsub_wrap()
        .test_mul()
        .test_mul_expand()
        .test_cmp()
        .test_shift<1>()
        .test_shift<8>()
        .test_dot_prod()
        .test_logic()
        .test_min_max()
        .test_absdiff()
        .test_mask()
        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
        .test_unpack()
        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
        ;
}
//============= 32-bit integer =====================================================================

// Full suite for 4-lane unsigned 32-bit vectors (adds reduce and transpose).
TEST(hal_intrin, uint32x4)
{
    TheTest<v_uint32x4>()
        .test_loadstore()
        .test_interleave()
        .test_expand()
        .test_addsub()
        .test_mul()
        .test_mul_expand()
        .test_cmp()
        .test_shift<1>()
        .test_shift<8>()
        .test_logic()
        .test_min_max()
        .test_absdiff()
        .test_reduce()
        .test_mask()
        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
        .test_unpack()
        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
        .test_transpose()
        ;
}

// Full suite for 4-lane signed 32-bit vectors (adds float conversions).
TEST(hal_intrin, int32x4)
{
    TheTest<v_int32x4>()
        .test_loadstore()
        .test_interleave()
        .test_expand()
        .test_addsub()
        .test_mul()
        .test_cmp()
        .test_shift<1>().test_shift<8>()
        .test_logic()
        .test_min_max()
        .test_absdiff()
        .test_reduce()
        .test_mask()
        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
        .test_unpack()
        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
        .test_float_cvt32()
        .test_float_cvt64()
        .test_transpose()
        ;
}
//============= 64-bit integer =====================================================================

// 2-lane 64-bit vectors support only a reduced operation set.
TEST(hal_intrin, uint64x2)
{
    TheTest<v_uint64x2>()
        .test_loadstore()
        .test_addsub()
        .test_shift<1>().test_shift<8>()
        .test_logic()
        .test_extract<0>().test_extract<1>()
        ;
}

// Same reduced suite for signed 64-bit lanes.
TEST(hal_intrin, int64x2)
{
    TheTest<v_int64x2>()
        .test_loadstore()
        .test_addsub()
        .test_shift<1>().test_shift<8>()
        .test_logic()
        .test_extract<0>().test_extract<1>()
        ;
}
//============= Floating point =====================================================================

// 4-lane single-precision suite (adds div, sqrt/abs, math helpers, matmul).
TEST(hal_intrin, float32x4)
{
    TheTest<v_float32x4>()
        .test_loadstore()
        .test_interleave()
        .test_addsub()
        .test_mul()
        .test_div()
        .test_cmp()
        .test_sqrt_abs()
        .test_min_max()
        .test_float_absdiff()
        .test_reduce()
        .test_mask()
        .test_unpack()
        .test_float_math()
        .test_float_cvt64()
        .test_matmul()
        .test_transpose()
        ;
}

#if CV_SIMD128_64F
// 2-lane double-precision suite, only on backends with 64-bit float support.
TEST(hal_intrin, float64x2)
{
    TheTest<v_float64x2>()
        .test_loadstore()
        .test_addsub()
        .test_mul()
        .test_div()
        .test_cmp()
        .test_sqrt_abs()
        .test_min_max()
        .test_float_absdiff()
        .test_mask()
        .test_unpack()
        .test_float_math()
        .test_float_cvt32()
        ;
}
#endif
modules/hal/test/test_intrin_utils.hpp
0 → 100644
View file @
a275489f
#ifndef _TEST_UTILS_HPP_
#define _TEST_UTILS_HPP_
#include "opencv2/hal/intrin.hpp"
#include "opencv2/ts.hpp"
#include <ostream>
#include <algorithm>
template
<
typename
R
>
struct
Data
;
template
<
int
N
>
struct
initializer
;
template
<>
struct
initializer
<
16
>
{
template
<
typename
R
>
static
R
init
(
const
Data
<
R
>
&
d
)
{
return
R
(
d
[
0
],
d
[
1
],
d
[
2
],
d
[
3
],
d
[
4
],
d
[
5
],
d
[
6
],
d
[
7
],
d
[
8
],
d
[
9
],
d
[
10
],
d
[
11
],
d
[
12
],
d
[
13
],
d
[
14
],
d
[
15
]);
}
};
template
<>
struct
initializer
<
8
>
{
template
<
typename
R
>
static
R
init
(
const
Data
<
R
>
&
d
)
{
return
R
(
d
[
0
],
d
[
1
],
d
[
2
],
d
[
3
],
d
[
4
],
d
[
5
],
d
[
6
],
d
[
7
]);
}
};
template
<>
struct
initializer
<
4
>
{
template
<
typename
R
>
static
R
init
(
const
Data
<
R
>
&
d
)
{
return
R
(
d
[
0
],
d
[
1
],
d
[
2
],
d
[
3
]);
}
};
template
<>
struct
initializer
<
2
>
{
template
<
typename
R
>
static
R
init
(
const
Data
<
R
>
&
d
)
{
return
R
(
d
[
0
],
d
[
1
]);
}
};
//==================================================================================================
// Scalar-side mirror of a vector register R: a plain lane array with
// element-wise helpers, implicitly convertible to/from R for test code.
template <typename R> struct Data
{
    typedef typename R::lane_type LaneType;

    // Default contents are (1, 2, ..., nlanes) — several tests rely on this.
    Data()
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] = (LaneType)(i + 1);
    }
    // Broadcast constructor.
    Data(LaneType val)
    {
        fill(val);
    }
    // Capture the lanes of an existing register.
    Data(const R & r)
    {
        *this = r;
    }
    // Build a register from the buffer via the lane-count-matched initializer.
    operator R()
    {
        return initializer<R::nlanes>().init(*this);
    }
    // Store the register's lanes into this buffer.
    Data<R> & operator=(const R & r)
    {
        v_store(d, r);
        return *this;
    }
    // Element-wise scale of every lane.
    template <typename T> Data<R> & operator*=(T m)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] *= (LaneType)m;
        return *this;
    }
    // Element-wise offset of every lane.
    template <typename T> Data<R> & operator+=(T m)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] += (LaneType)m;
        return *this;
    }
    // Set every lane to val.
    void fill(LaneType val)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] = val;
    }
    // Reverse the lane order in place.
    void reverse()
    {
        for (int i = 0; i < R::nlanes / 2; ++i)
            std::swap(d[i], d[R::nlanes - i - 1]);
    }
    // Bounds-checked lane access (const and mutable).
    const LaneType & operator[](int i) const
    {
        CV_Assert(i >= 0 && i < R::nlanes);
        return d[i];
    }
    LaneType & operator[](int i)
    {
        CV_Assert(i >= 0 && i < R::nlanes);
        return d[i];
    }
    // Pointer to the upper half of the buffer (used by half load/store tests).
    const LaneType * mid() const
    {
        return d + R::nlanes / 2;
    }
    LaneType * mid()
    {
        return d + R::nlanes / 2;
    }
    // Exact lane-by-lane equality.
    bool operator==(const Data<R> & other) const
    {
        for (int i = 0; i < R::nlanes; ++i)
            if (d[i] != other.d[i])
                return false;
        return true;
    }
    // Zero all lanes.
    void clear()
    {
        fill(0);
    }
    bool isZero() const
    {
        return isValue(0);
    }
    // True when every lane equals val.
    bool isValue(uchar val) const
    {
        for (int i = 0; i < R::nlanes; ++i)
            if (d[i] != val)
                return false;
        return true;
    }

    // Raw lane storage.
    LaneType d[R::nlanes];
};
// Pair of Data<R> buffers where 'a' is 16-byte aligned and 'u' is forced to
// be misaligned by the intervening single byte, for aligned/unaligned
// load-store testing.
template <typename R> struct AlignedData
{
    Data<R> CV_DECL_ALIGNED(16) a; // aligned
    char dummy;                    // breaks the alignment of the next member
    Data<R> u; // unaligned
};
// Prints a Data<R> as "{ v0, v1, ... }" so test-failure messages are readable.
template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R> & d)
{
    out << "{ ";
    for (int j = 0; j < R::nlanes; ++j)
    {
        if (j > 0)
            out << ", ";
        // unary + promotes char-sized lanes so they print as numbers,
        // not as characters
        out << +d.d[j];
    }
    out << " }";
    return out;
}
//==================================================================================================
// Compile-time registry of related register types for each vector type:
//   w_reg   - double-width lane type (for expand/pack tests)
//   q_reg   - quadruple-width lane type (8-bit types only)
//   int_reg - same-width signed integer counterpart
//   u_reg   - same-width unsigned counterpart
// plus zero()/all(val) factory helpers.
template <typename R> struct RegTrait;

template <> struct RegTrait<cv::v_uint8x16> {
    typedef cv::v_uint16x8 w_reg;
    typedef cv::v_uint32x4 q_reg;
    typedef cv::v_uint8x16 u_reg;
    static cv::v_uint8x16 zero() { return cv::v_setzero_u8(); }
    static cv::v_uint8x16 all(uchar val) { return cv::v_setall_u8(val); }
};

template <> struct RegTrait<cv::v_int8x16> {
    typedef cv::v_int16x8 w_reg;
    typedef cv::v_int32x4 q_reg;
    typedef cv::v_uint8x16 u_reg;
    static cv::v_int8x16 zero() { return cv::v_setzero_s8(); }
    static cv::v_int8x16 all(schar val) { return cv::v_setall_s8(val); }
};

template <> struct RegTrait<cv::v_uint16x8> {
    typedef cv::v_uint32x4 w_reg;
    typedef cv::v_int16x8 int_reg;
    typedef cv::v_uint16x8 u_reg;
    static cv::v_uint16x8 zero() { return cv::v_setzero_u16(); }
    static cv::v_uint16x8 all(ushort val) { return cv::v_setall_u16(val); }
};

template <> struct RegTrait<cv::v_int16x8> {
    typedef cv::v_int32x4 w_reg;
    typedef cv::v_uint16x8 u_reg;
    static cv::v_int16x8 zero() { return cv::v_setzero_s16(); }
    static cv::v_int16x8 all(short val) { return cv::v_setall_s16(val); }
};

template <> struct RegTrait<cv::v_uint32x4> {
    typedef cv::v_uint64x2 w_reg;
    typedef cv::v_int32x4 int_reg;
    typedef cv::v_uint32x4 u_reg;
    static cv::v_uint32x4 zero() { return cv::v_setzero_u32(); }
    static cv::v_uint32x4 all(unsigned val) { return cv::v_setall_u32(val); }
};

template <> struct RegTrait<cv::v_int32x4> {
    typedef cv::v_int64x2 w_reg;
    typedef cv::v_uint32x4 u_reg;
    static cv::v_int32x4 zero() { return cv::v_setzero_s32(); }
    static cv::v_int32x4 all(int val) { return cv::v_setall_s32(val); }
};

// 64-bit integer types have no wider counterpart, so only the factories.
template <> struct RegTrait<cv::v_uint64x2> {
    static cv::v_uint64x2 zero() { return cv::v_setzero_u64(); }
    static cv::v_uint64x2 all(uint64 val) { return cv::v_setall_u64(val); }
};

template <> struct RegTrait<cv::v_int64x2> {
    static cv::v_int64x2 zero() { return cv::v_setzero_s64(); }
    static cv::v_int64x2 all(int64 val) { return cv::v_setall_s64(val); }
};

template <> struct RegTrait<cv::v_float32x4> {
    typedef cv::v_int32x4 int_reg;
    typedef cv::v_float32x4 u_reg;
    static cv::v_float32x4 zero() { return cv::v_setzero_f32(); }
    static cv::v_float32x4 all(float val) { return cv::v_setall_f32(val); }
};

#if CV_SIMD128_64F
template <> struct RegTrait<cv::v_float64x2> {
    typedef cv::v_int32x4 int_reg;
    typedef cv::v_float64x2 u_reg;
    static cv::v_float64x2 zero() { return cv::v_setzero_f64(); }
    static cv::v_float64x2 all(double val) { return cv::v_setall_f64(val); }
};
#endif
#endif
modules/hal/test/test_main.cpp
0 → 100644
View file @
a275489f
#include "opencv2/ts.hpp"

// Standard OpenCV test-module entry point: generates main() and runs all
// registered tests under the "cv" test identifier.
CV_TEST_MAIN("cv")
modules/hal/test/test_precomp.hpp
0 → 100644
View file @
a275489f
#ifndef __OPENCV_HAL_TEST_PRECOMP_HPP__
#define __OPENCV_HAL_TEST_PRECOMP_HPP__
#include <iostream>
#include <limits>
#include "opencv2/ts.hpp"
#include "opencv2/hal.hpp"
#include "opencv2/hal/defs.h"
#include "opencv2/hal/intrin.hpp"
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment