Commit a275489f authored by Maksim Shabunin

HAL universal intrinsics tests and documentation

parent 190d00ea
...@@ -49,10 +49,21 @@ ...@@ -49,10 +49,21 @@
/** /**
@defgroup hal Hardware Acceleration Layer @defgroup hal Hardware Acceleration Layer
@{
@defgroup hal_intrin Universal intrinsics
@{
@defgroup hal_intrin_impl Private implementation helpers
@}
@defgroup hal_utils Platform-dependent utils
@}
*/ */
namespace cv { namespace hal { namespace cv { namespace hal {
//! @addtogroup hal
//! @{
namespace Error { namespace Error {
enum enum
...@@ -93,6 +104,8 @@ void sqrt(const double* src, double* dst, int len); ...@@ -93,6 +104,8 @@ void sqrt(const double* src, double* dst, int len);
void invSqrt(const float* src, float* dst, int len); void invSqrt(const float* src, float* dst, int len);
void invSqrt(const double* src, double* dst, int len); void invSqrt(const double* src, double* dst, int len);
//! @}
}} //cv::hal }} //cv::hal
#endif //__OPENCV_HAL_HPP__ #endif //__OPENCV_HAL_HPP__
...@@ -45,6 +45,9 @@ ...@@ -45,6 +45,9 @@
#ifndef __OPENCV_DEF_H__ #ifndef __OPENCV_DEF_H__
#define __OPENCV_DEF_H__ #define __OPENCV_DEF_H__
//! @addtogroup hal_utils
//! @{
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300 #if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */ # define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif #endif
...@@ -335,9 +338,6 @@ Cv64suf; ...@@ -335,9 +338,6 @@ Cv64suf;
# include "tegra_round.hpp" # include "tegra_round.hpp"
#endif #endif
//! @addtogroup core_utils
//! @{
#if CV_VFP #if CV_VFP
// 1. general scheme // 1. general scheme
#define ARM_ROUND(_value, _asm_string) \ #define ARM_ROUND(_value, _asm_string) \
...@@ -567,15 +567,19 @@ CV_INLINE int cvIsInf( float value ) ...@@ -567,15 +567,19 @@ CV_INLINE int cvIsInf( float value )
return (ieee754.u & 0x7fffffff) == 0x7f800000; return (ieee754.u & 0x7fffffff) == 0x7f800000;
} }
//! @}
#include <algorithm> #include <algorithm>
namespace cv namespace cv
{ {
//! @addtogroup hal_utils
//! @{
/////////////// saturate_cast (used in image & signal processing) /////////////////// /////////////// saturate_cast (used in image & signal processing) ///////////////////
/** /** @brief Template function for accurate conversion from one primitive type to another.
Template function for accurate conversion from one primitive type to another.
The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>() The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
and others. They perform an efficient and accurate conversion from one primitive type to another and others. They perform an efficient and accurate conversion from one primitive type to another
...@@ -618,8 +622,6 @@ template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp( ...@@ -618,8 +622,6 @@ template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp(
/** @overload */ /** @overload */
template<typename _Tp> static inline _Tp saturate_cast(uint64 v) { return _Tp(v); } template<typename _Tp> static inline _Tp saturate_cast(uint64 v) { return _Tp(v); }
//! @cond IGNORED
template<> inline uchar saturate_cast<uchar>(schar v) { return (uchar)std::max((int)v, 0); } template<> inline uchar saturate_cast<uchar>(schar v) { return (uchar)std::max((int)v, 0); }
template<> inline uchar saturate_cast<uchar>(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); } template<> inline uchar saturate_cast<uchar>(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
template<> inline uchar saturate_cast<uchar>(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } template<> inline uchar saturate_cast<uchar>(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
...@@ -664,12 +666,10 @@ template<> inline int saturate_cast<int>(double v) { return cvRound(v) ...@@ -664,12 +666,10 @@ template<> inline int saturate_cast<int>(double v) { return cvRound(v)
template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); } template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); }
template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); } template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
//! @endcond //! @}
} }
#endif // __cplusplus #endif // __cplusplus
//! @} core_utils
#endif //__OPENCV_HAL_H__ #endif //__OPENCV_HAL_H__
...@@ -48,6 +48,7 @@ ...@@ -48,6 +48,7 @@
#include <cmath> #include <cmath>
#include <float.h> #include <float.h>
#include <stdlib.h> #include <stdlib.h>
#include "opencv2/hal/defs.h"
#define OPENCV_HAL_ADD(a, b) ((a) + (b)) #define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b)) #define OPENCV_HAL_AND(a, b) ((a) & (b))
...@@ -59,6 +60,10 @@ ...@@ -59,6 +60,10 @@
// access from within opencv code more accessible // access from within opencv code more accessible
namespace cv { namespace cv {
//! @addtogroup hal_intrin
//! @{
//! @cond IGNORED
template<typename _Tp> struct V_TypeTraits template<typename _Tp> struct V_TypeTraits
{ {
typedef _Tp int_type; typedef _Tp int_type;
...@@ -82,6 +87,7 @@ template<> struct V_TypeTraits<uchar> ...@@ -82,6 +87,7 @@ template<> struct V_TypeTraits<uchar>
typedef int sum_type; typedef int sum_type;
typedef ushort w_type; typedef ushort w_type;
typedef unsigned q_type;
enum { delta = 128, shift = 8 }; enum { delta = 128, shift = 8 };
...@@ -99,6 +105,7 @@ template<> struct V_TypeTraits<schar> ...@@ -99,6 +105,7 @@ template<> struct V_TypeTraits<schar>
typedef int sum_type; typedef int sum_type;
typedef short w_type; typedef short w_type;
typedef int q_type;
enum { delta = 128, shift = 8 }; enum { delta = 128, shift = 8 };
...@@ -265,8 +272,22 @@ template<> struct V_TypeTraits<double> ...@@ -265,8 +272,22 @@ template<> struct V_TypeTraits<double>
} }
}; };
//! Compile-time properties of a 128-bit (16-byte) SIMD register holding lanes of type T.
template <typename T>
struct V_SIMD128Traits
{
    enum
    {
        //! Number of T-sized lanes that fit into 16 bytes.
        nlanes = 16 / sizeof(T)
    };
};
//! @endcond
//! @}
} }
#ifdef CV_DOXYGEN
# undef CV_SSE2
# undef CV_NEON
#endif
#if CV_SSE2 #if CV_SSE2
#include "opencv2/hal/intrin_sse.hpp" #include "opencv2/hal/intrin_sse.hpp"
...@@ -281,12 +302,19 @@ template<> struct V_TypeTraits<double> ...@@ -281,12 +302,19 @@ template<> struct V_TypeTraits<double>
#endif #endif
//! @addtogroup hal_intrin
//! @{
#ifndef CV_SIMD128 #ifndef CV_SIMD128
//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled)
#define CV_SIMD128 0 #define CV_SIMD128 0
#endif #endif
#ifndef CV_SIMD128_64F #ifndef CV_SIMD128_64F
//! Set to 1 if current intrinsics implementation supports 64-bit float vectors
#define CV_SIMD128_64F 0 #define CV_SIMD128_64F 0
#endif #endif
//! @}
#endif #endif
...@@ -48,6 +48,8 @@ ...@@ -48,6 +48,8 @@
namespace cv namespace cv
{ {
//! @cond IGNORED
#define CV_SIMD128 1 #define CV_SIMD128 1
struct v_uint8x16 struct v_uint8x16
...@@ -278,14 +280,15 @@ void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ ...@@ -278,14 +280,15 @@ void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
} }
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n) OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3) const v_float32x4& m3)
...@@ -374,7 +377,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) ...@@ -374,7 +377,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ {
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
int32x4x2_t cd = vtrnq_s32(c, d); int32x4x2_t cd = vuzpq_s32(c, d);
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
} }
...@@ -497,6 +500,16 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) ...@@ -497,6 +500,16 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
// Defines the binary function `func(a, b)` for two _Tpvec arguments whose result has a
// different vector type: `intrin` is applied to a.val and b.val, the result is passed
// through `cast`, and the outcome is wrapped in _Tpvec2.
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec2(cast(intrin(a.val, b.val))); \
}
// v_absdiff overloads for signed 8/16/32-bit vectors. Each one calls the matching vabdq_s*
// intrinsic and reinterprets the signed result as the unsigned vector type of the same
// lane width, so the return type is v_uint8x16 / v_uint16x8 / v_uint32x4.
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{ {
v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
...@@ -641,13 +654,13 @@ inline bool v_check_all(const v_float32x4& a) ...@@ -641,13 +654,13 @@ inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); } { return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_int8x16& a) inline bool v_check_any(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); } { return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a) inline bool v_check_any(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); } { return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a) inline bool v_check_any(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); } { return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a) inline bool v_check_any(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); } { return v_check_any(v_reinterpret_as_u32(a)); }
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \ #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
...@@ -678,6 +691,8 @@ OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8) ...@@ -678,6 +691,8 @@ OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8) OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16) OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16) OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
inline v_uint32x4 v_load_expand_q(const uchar* ptr) inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{ {
...@@ -840,6 +855,8 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a) ...@@ -840,6 +855,8 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
return v_float32x4(vcvtq_f32_s32(a.val)); return v_float32x4(vcvtq_f32_s32(a.val));
} }
//! @endcond
} }
#endif #endif
...@@ -51,6 +51,8 @@ ...@@ -51,6 +51,8 @@
namespace cv namespace cv
{ {
//! @cond IGNORED
struct v_uint8x16 struct v_uint8x16
{ {
typedef uchar lane_type; typedef uchar lane_type;
...@@ -296,6 +298,11 @@ OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32) ...@@ -296,6 +298,11 @@ OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64) OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64) OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
// Reinterpretation between the two floating-point vector types.
// Same-type overloads return their argument unchanged; cross-type overloads go through
// the _mm_castpd_ps / _mm_castps_pd intrinsics and wrap the result in the target type.
inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a)
{
    return a;
}

inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a)
{
    return a;
}

inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a)
{
    const __m128 as_ps = _mm_castpd_ps(a.val);
    return v_float32x4(as_ps);
}

inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a)
{
    const __m128d as_pd = _mm_castps_pd(a.val);
    return v_float64x2(as_pd);
}
//////////////// PACK /////////////// //////////////// PACK ///////////////
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{ {
...@@ -430,6 +437,17 @@ inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) ...@@ -430,6 +437,17 @@ inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
_mm_storel_epi64((__m128i*)ptr, r); _mm_storel_epi64((__m128i*)ptr, r);
} }
// Rounding right shift by n of two signed 32-bit vectors, packed into one vector of
// unsigned 16-bit lanes (lanes of `a` in the low half, lanes of `b` in the high half).
//
// Steps applied to each input:
//   1. add 1 << (n-1) and arithmetic-shift right by n;
//   2. subtract 32768 in 32-bit lanes before packing with _mm_packs_epi32;
//   3. subtract -32768 in 16-bit lanes after packing to undo the bias of step 2.
template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    const __m128i rounding = _mm_set1_epi32(1 << (n-1));
    const __m128i bias32 = _mm_set1_epi32(32768);
    const __m128i bias16 = _mm_set1_epi16(-32768);

    __m128i lo = _mm_srai_epi32(_mm_add_epi32(a.val, rounding), n);
    lo = _mm_sub_epi32(lo, bias32);
    lo = _mm_packs_epi32(lo, lo);
    lo = _mm_sub_epi16(lo, bias16);

    __m128i hi = _mm_srai_epi32(_mm_add_epi32(b.val, rounding), n);
    hi = _mm_sub_epi32(hi, bias32);
    hi = _mm_packs_epi32(hi, hi);
    hi = _mm_sub_epi16(hi, bias16);

    // Each packed value carries its four results in the low 64 bits; join the two halves.
    return v_uint16x8(_mm_unpacklo_epi64(lo, hi));
}
template<int n> inline template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{ {
...@@ -460,7 +478,7 @@ void v_rshr_pack_store(short* ptr, const v_int32x4& a) ...@@ -460,7 +478,7 @@ void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{ {
__m128i delta = _mm_set1_epi32(1 << (n-1)); __m128i delta = _mm_set1_epi32(1 << (n-1));
__m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n); __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, a1); _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
} }
...@@ -469,7 +487,7 @@ inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) ...@@ -469,7 +487,7 @@ inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{ {
__m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
__m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
return v_uint32x4(_mm_unpacklo_epi64(v0, v1)); return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
} }
inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
...@@ -483,7 +501,7 @@ inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) ...@@ -483,7 +501,7 @@ inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{ {
__m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
__m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
return v_int32x4(_mm_unpacklo_epi64(v0, v1)); return v_int32x4(_mm_unpacklo_epi32(v0, v1));
} }
inline void v_pack_store(int* ptr, const v_int64x2& a) inline void v_pack_store(int* ptr, const v_int64x2& a)
...@@ -501,7 +519,7 @@ v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) ...@@ -501,7 +519,7 @@ v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
__m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n); __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
__m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
__m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
return v_uint32x4(_mm_unpacklo_epi64(v0, v1)); return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
} }
template<int n> inline template<int n> inline
...@@ -534,7 +552,7 @@ v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) ...@@ -534,7 +552,7 @@ v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
__m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n); __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
__m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
__m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
return v_int32x4(_mm_unpacklo_epi64(v0, v1)); return v_int32x4(_mm_unpacklo_epi32(v0, v1));
} }
template<int n> inline template<int n> inline
...@@ -630,8 +648,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, ...@@ -630,8 +648,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
{ {
__m128i v0 = _mm_mullo_epi16(a.val, b.val); __m128i v0 = _mm_mullo_epi16(a.val, b.val);
__m128i v1 = _mm_mulhi_epi16(a.val, b.val); __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
c.val = _mm_unpacklo_epi32(v0, v1); c.val = _mm_unpacklo_epi16(v0, v1);
d.val = _mm_unpackhi_epi32(v0, v1); d.val = _mm_unpackhi_epi16(v0, v1);
} }
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
...@@ -639,8 +657,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, ...@@ -639,8 +657,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
{ {
__m128i v0 = _mm_mullo_epi16(a.val, b.val); __m128i v0 = _mm_mullo_epi16(a.val, b.val);
__m128i v1 = _mm_mulhi_epu16(a.val, b.val); __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
c.val = _mm_unpacklo_epi32(v0, v1); c.val = _mm_unpacklo_epi16(v0, v1);
d.val = _mm_unpackhi_epi32(v0, v1); d.val = _mm_unpackhi_epi16(v0, v1);
} }
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
...@@ -869,6 +887,18 @@ inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ ...@@ -869,6 +887,18 @@ inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
// Absolute difference of unsigned 32-bit lanes, computed as max(a, b) - min(a, b).
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{
    const v_uint32x4 larger = v_max(a, b);
    const v_uint32x4 smaller = v_min(a, b);
    return larger - smaller;
}
// Absolute difference of signed 32-bit lanes, returned as an unsigned vector.
// The difference a - b is negated in the lanes where b > a, using the comparison mask:
// (x ^ mask) - mask leaves x unchanged for a zero mask and yields -x for an all-ones mask.
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    const __m128i diff = _mm_sub_epi32(a.val, b.val);
    const __m128i b_greater = _mm_cmpgt_epi32(b.val, a.val);
    const __m128i flipped = _mm_xor_si128(diff, b_greater);
    return v_uint32x4(_mm_sub_epi32(flipped, b_greater));
}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \ { \
...@@ -1047,8 +1077,8 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128) ...@@ -1047,8 +1077,8 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128) OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128) OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128) OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128) // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128) // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps) OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd) OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
...@@ -1257,7 +1287,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, ...@@ -1257,7 +1287,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
__m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
__m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
__m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
__m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
...@@ -1266,13 +1296,13 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, ...@@ -1266,13 +1296,13 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ... v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ... v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
v2 = _mm_unpackhi_epi8(u0, u1); // b0 b2 b4 b6 ... v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
v3 = _mm_unpackhi_epi8(u2, u3); // b1 b3 b5 b7 ... v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
a.val = _mm_unpacklo_epi8(v0, v1); a.val = _mm_unpacklo_epi8(v0, v1);
b.val = _mm_unpacklo_epi8(v2, v3); b.val = _mm_unpackhi_epi8(v0, v1);
c.val = _mm_unpackhi_epi8(v0, v1); c.val = _mm_unpacklo_epi8(v2, v3);
d.val = _mm_unpacklo_epi8(v2, v3); d.val = _mm_unpackhi_epi8(v2, v3);
} }
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
...@@ -1560,6 +1590,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) ...@@ -1560,6 +1590,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
return v_float64x2(_mm_cvtps_pd(a.val)); return v_float64x2(_mm_cvtps_pd(a.val));
} }
//! @endcond
} }
#endif #endif
This diff is collapsed.
#ifndef _TEST_UTILS_HPP_
#define _TEST_UTILS_HPP_
#include "opencv2/hal/intrin.hpp"
#include "opencv2/ts.hpp"
#include <ostream>
#include <algorithm>
template <typename R> struct Data;
template <int N> struct initializer;
//! Builds a register of type R from the first 16 elements of a Data<R>.
template <>
struct initializer<16>
{
    template <typename R>
    static R init(const Data<R> & d)
    {
        return R(d[0],  d[1],  d[2],  d[3],
                 d[4],  d[5],  d[6],  d[7],
                 d[8],  d[9],  d[10], d[11],
                 d[12], d[13], d[14], d[15]);
    }
};
//! Builds a register of type R from the first 8 elements of a Data<R>.
template <>
struct initializer<8>
{
    template <typename R>
    static R init(const Data<R> & d)
    {
        return R(d[0], d[1], d[2], d[3],
                 d[4], d[5], d[6], d[7]);
    }
};
//! Builds a register of type R from the first 4 elements of a Data<R>.
template <>
struct initializer<4>
{
    template <typename R>
    static R init(const Data<R> & d)
    {
        return R(d[0], d[1],
                 d[2], d[3]);
    }
};
//! Builds a register of type R from the first 2 elements of a Data<R>.
template <>
struct initializer<2>
{
    template <typename R>
    static R init(const Data<R> & d)
    {
        return R(d[0],
                 d[1]);
    }
};
//==================================================================================================
/**
 * Scalar mirror of a SIMD register type R: R::nlanes values of type R::lane_type kept in a
 * plain array, with helpers to fill, modify, compare and convert them to and from R.
 */
template <typename R> struct Data
{
    typedef typename R::lane_type LaneType;

    //! Initializes the lanes with the ramp 1, 2, ..., R::nlanes (each value cast to LaneType).
    Data()
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] = (LaneType)(i + 1);
    }
    //! Sets every lane to @p val.
    Data(LaneType val)
    {
        fill(val);
    }
    //! Copies the lanes of register @p r (delegates to operator=(const R &)).
    Data(const R & r)
    {
        *this = r;
    }
    //! Builds a register from the stored lanes through initializer<R::nlanes>.
    //! Const-qualified because it only reads the lanes, so const objects convert as well.
    operator R () const
    {
        return initializer<R::nlanes>().init(*this);
    }
    //! Writes register @p r into the lane array with v_store.
    Data<R> & operator=(const R & r)
    {
        v_store(d, r);
        return *this;
    }
    //! Multiplies every lane by @p m, converted to LaneType first.
    template <typename T> Data<R> & operator*=(T m)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] *= (LaneType)m;
        return *this;
    }
    //! Adds @p m, converted to LaneType first, to every lane.
    template <typename T> Data<R> & operator+=(T m)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] += (LaneType)m;
        return *this;
    }
    //! Overwrites every lane with @p val.
    void fill(LaneType val)
    {
        for (int i = 0; i < R::nlanes; ++i)
            d[i] = val;
    }
    //! Reverses the lane order in place.
    void reverse()
    {
        for (int i = 0; i < R::nlanes / 2; ++i)
            std::swap(d[i], d[R::nlanes - i - 1]);
    }
    //! Read access to lane @p i; the index is checked with CV_Assert.
    const LaneType & operator[](int i) const
    {
        CV_Assert(i >= 0 && i < R::nlanes);
        return d[i];
    }
    //! Write access to lane @p i; the index is checked with CV_Assert.
    LaneType & operator[](int i)
    {
        CV_Assert(i >= 0 && i < R::nlanes);
        return d[i];
    }
    //! Pointer to the first lane of the upper half (index R::nlanes / 2).
    const LaneType * mid() const
    {
        return d + R::nlanes / 2;
    }
    //! Mutable pointer to the first lane of the upper half (index R::nlanes / 2).
    LaneType * mid()
    {
        return d + R::nlanes / 2;
    }
    //! True when all lanes compare equal to the corresponding lanes of @p other.
    bool operator==(const Data<R> & other) const
    {
        for (int i = 0; i < R::nlanes; ++i)
            if (d[i] != other.d[i])
                return false;
        return true;
    }
    //! Sets every lane to zero.
    void clear()
    {
        fill(0);
    }
    //! True when every lane equals zero.
    bool isZero() const
    {
        return isValue(0);
    }
    //! True when every lane equals @p val.
    //! Note that @p val is a uchar regardless of LaneType, so only 0..255 can be passed.
    bool isValue(uchar val) const
    {
        for (int i = 0; i < R::nlanes; ++i)
            if (d[i] != val)
                return false;
        return true;
    }

    LaneType d[R::nlanes]; //!< lane storage
};
// Holds two Data<R> members in one object: `a` is declared with CV_DECL_ALIGNED(16),
// while `u` is placed after a single char member.
// NOTE(review): `dummy` presumably exists only to push `u` off a 16-byte boundary so that
// tests can exercise both aligned and unaligned memory access — confirm against the tests.
template<typename R> struct AlignedData
{
Data<R> CV_DECL_ALIGNED(16) a; // aligned
char dummy;
Data<R> u; // unaligned
};
//! Prints the lanes of @p d as "{ v0, v1, ..., vN }".
//! Each lane is written through unary plus, so character-sized lanes appear as numbers.
template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R> & d)
{
    out << "{ ";
    const char * separator = "";
    for (int lane = 0; lane < R::nlanes; ++lane)
    {
        // Alternative hexadecimal dump of the raw lane bits, kept for debugging:
        // out << std::hex << +V_TypeTraits<typename R::lane_type>::reinterpret_int(d.d[lane]);
        out << separator << +d.d[lane];
        separator = ", ";
    }
    out << " }";
    return out;
}
//==================================================================================================
template <typename R> struct RegTrait;
//! Related register types and constant builders for cv::v_uint8x16.
template <>
struct RegTrait<cv::v_uint8x16>
{
    typedef cv::v_uint16x8 w_reg; //!< unsigned register with 16-bit lanes
    typedef cv::v_uint32x4 q_reg; //!< unsigned register with 32-bit lanes
    typedef cv::v_uint8x16 u_reg; //!< unsigned register of the same lane width (this type)

    //! Forwards to cv::v_setzero_u8().
    static cv::v_uint8x16 zero()
    {
        return cv::v_setzero_u8();
    }

    //! Forwards @p val to cv::v_setall_u8().
    static cv::v_uint8x16 all(uchar val)
    {
        return cv::v_setall_u8(val);
    }
};
//! Related register types and constant builders for cv::v_int8x16.
template <>
struct RegTrait<cv::v_int8x16>
{
    typedef cv::v_int16x8 w_reg;  //!< signed register with 16-bit lanes
    typedef cv::v_int32x4 q_reg;  //!< signed register with 32-bit lanes
    typedef cv::v_uint8x16 u_reg; //!< unsigned register of the same lane width

    //! Forwards to cv::v_setzero_s8().
    static cv::v_int8x16 zero()
    {
        return cv::v_setzero_s8();
    }

    //! Forwards @p val to cv::v_setall_s8().
    static cv::v_int8x16 all(schar val)
    {
        return cv::v_setall_s8(val);
    }
};
//! Related register types and constant builders for cv::v_uint16x8.
template <>
struct RegTrait<cv::v_uint16x8>
{
    typedef cv::v_uint32x4 w_reg;  //!< unsigned register with 32-bit lanes
    typedef cv::v_int16x8 int_reg; //!< signed register of the same lane width
    typedef cv::v_uint16x8 u_reg;  //!< unsigned register of the same lane width (this type)

    //! Forwards to cv::v_setzero_u16().
    static cv::v_uint16x8 zero()
    {
        return cv::v_setzero_u16();
    }

    //! Forwards @p val to cv::v_setall_u16().
    static cv::v_uint16x8 all(ushort val)
    {
        return cv::v_setall_u16(val);
    }
};
//! Related register types and constant builders for cv::v_int16x8.
template <>
struct RegTrait<cv::v_int16x8>
{
    typedef cv::v_int32x4 w_reg;  //!< signed register with 32-bit lanes
    typedef cv::v_uint16x8 u_reg; //!< unsigned register of the same lane width

    //! Forwards to cv::v_setzero_s16().
    static cv::v_int16x8 zero()
    {
        return cv::v_setzero_s16();
    }

    //! Forwards @p val to cv::v_setall_s16().
    static cv::v_int16x8 all(short val)
    {
        return cv::v_setall_s16(val);
    }
};
//! Related register types and constant builders for cv::v_uint32x4.
template <>
struct RegTrait<cv::v_uint32x4>
{
    typedef cv::v_uint64x2 w_reg;  //!< unsigned register with 64-bit lanes
    typedef cv::v_int32x4 int_reg; //!< signed register of the same lane width
    typedef cv::v_uint32x4 u_reg;  //!< unsigned register of the same lane width (this type)

    //! Forwards to cv::v_setzero_u32().
    static cv::v_uint32x4 zero()
    {
        return cv::v_setzero_u32();
    }

    //! Forwards @p val to cv::v_setall_u32().
    static cv::v_uint32x4 all(unsigned val)
    {
        return cv::v_setall_u32(val);
    }
};
//! Related register types and constant builders for cv::v_int32x4.
template <>
struct RegTrait<cv::v_int32x4>
{
    typedef cv::v_int64x2 w_reg;  //!< signed register with 64-bit lanes
    typedef cv::v_uint32x4 u_reg; //!< unsigned register of the same lane width

    //! Forwards to cv::v_setzero_s32().
    static cv::v_int32x4 zero()
    {
        return cv::v_setzero_s32();
    }

    //! Forwards @p val to cv::v_setall_s32().
    static cv::v_int32x4 all(int val)
    {
        return cv::v_setall_s32(val);
    }
};
//! Constant builders for cv::v_uint64x2 (no related register types are declared).
template <>
struct RegTrait<cv::v_uint64x2>
{
    //! Forwards to cv::v_setzero_u64().
    static cv::v_uint64x2 zero()
    {
        return cv::v_setzero_u64();
    }

    //! Forwards @p val to cv::v_setall_u64().
    static cv::v_uint64x2 all(uint64 val)
    {
        return cv::v_setall_u64(val);
    }
};
//! Constant builders for cv::v_int64x2 (no related register types are declared).
template <>
struct RegTrait<cv::v_int64x2>
{
    //! Forwards to cv::v_setzero_s64().
    static cv::v_int64x2 zero()
    {
        return cv::v_setzero_s64();
    }

    //! Forwards @p val to cv::v_setall_s64().
    static cv::v_int64x2 all(int64 val)
    {
        return cv::v_setall_s64(val);
    }
};
//! Related register types and constant builders for cv::v_float32x4.
template <>
struct RegTrait<cv::v_float32x4>
{
    typedef cv::v_int32x4 int_reg; //!< signed integer register with 32-bit lanes
    typedef cv::v_float32x4 u_reg; //!< declared as the float type itself

    //! Forwards to cv::v_setzero_f32().
    static cv::v_float32x4 zero()
    {
        return cv::v_setzero_f32();
    }

    //! Forwards @p val to cv::v_setall_f32().
    static cv::v_float32x4 all(float val)
    {
        return cv::v_setall_f32(val);
    }
};
#if CV_SIMD128_64F
//! Related register types and constant builders for cv::v_float64x2.
//! Only compiled when CV_SIMD128_64F is non-zero.
template <>
struct RegTrait<cv::v_float64x2>
{
    typedef cv::v_int32x4 int_reg; //!< signed integer register with 32-bit lanes
    typedef cv::v_float64x2 u_reg; //!< declared as the float type itself

    //! Forwards to cv::v_setzero_f64().
    static cv::v_float64x2 zero()
    {
        return cv::v_setzero_f64();
    }

    //! Forwards @p val to cv::v_setall_f64().
    static cv::v_float64x2 all(double val)
    {
        return cv::v_setall_f64(val);
    }
};
#endif
#endif
#include "opencv2/ts.hpp"
// NOTE(review): CV_TEST_MAIN presumably comes from opencv2/ts.hpp and expands to the entry
// point of this test executable; confirm there what the "cv" argument is used for.
CV_TEST_MAIN("cv")
#ifndef __OPENCV_HAL_TEST_PRECOMP_HPP__
#define __OPENCV_HAL_TEST_PRECOMP_HPP__
#include <iostream>
#include <limits>
#include "opencv2/ts.hpp"
#include "opencv2/hal.hpp"
#include "opencv2/hal/defs.h"
#include "opencv2/hal/intrin.hpp"
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment