HAL universal intrinsics tests and documentation

a275489f · Maksim Shabunin · 190d00ea · a275489f · a275489f · a275489f
Commit a275489f authored Aug 27, 2015 by Maksim Shabunin
10 changed files
--- a/modules/hal/include/opencv2/hal.hpp
+++ b/modules/hal/include/opencv2/hal.hpp
@@ -49,10 +49,21 @@
 /**
  @defgroup hal Hardware Acceleration Layer
+  @{
+    @defgroup hal_intrin Universal intrinsics
+    @{
+      @defgroup hal_intrin_impl Private implementation helpers
+    @}
+    @defgroup hal_utils Platform-dependent utils
+  @}
 */
 namespace cv { namespace hal {
+//! @addtogroup hal
+//! @{
 namespace Error {
 enum
@@ -93,6 +104,8 @@ void sqrt(const double* src, double* dst, int len);
 void invSqrt(const float* src, float* dst, int len);
 void invSqrt(const double* src, double* dst, int len);
+//! @}
 }} //cv::hal
 #endif //__OPENCV_HAL_HPP__
--- a/modules/hal/include/opencv2/hal/defs.h
+++ b/modules/hal/include/opencv2/hal/defs.h
@@ -45,6 +45,9 @@
 #ifndef __OPENCV_DEF_H__
 #define __OPENCV_DEF_H__
+//! @addtogroup hal_utils
+//! @{
 #if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
 #  define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
 #endif
@@ -335,9 +338,6 @@ Cv64suf;
 #  include "tegra_round.hpp"
 #endif
-//! @addtogroup core_utils
-//! @{
 #if CV_VFP
    // 1. general scheme
    #define ARM_ROUND(_value, _asm_string) \
@@ -567,15 +567,19 @@ CV_INLINE int cvIsInf( float value )
    return (ieee754.u & 0x7fffffff) == 0x7f800000;
 }
+//! @}
 #include <algorithm>
 namespace cv
 {
+//! @addtogroup hal_utils
+//! @{
 /////////////// saturate_cast (used in image & signal processing) ///////////////////
-/**
+/** @brief Template function for accurate conversion from one primitive type to another.
- Template function for accurate conversion from one primitive type to another.
 The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
 and others. They perform an efficient and accurate conversion from one primitive type to another
@@ -618,8 +622,6 @@ template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(
 /** @overload */
 template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
-//! @cond IGNORED
 template<> inline uchar saturate_cast<uchar>(schar v)        { return (uchar)std::max((int)v, 0); }
 template<> inline uchar saturate_cast<uchar>(ushort v)       { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
 template<> inline uchar saturate_cast<uchar>(int v)          { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
@@ -664,12 +666,10 @@ template<> inline int saturate_cast<int>(double v)           { return cvRound(v)
 template<> inline unsigned saturate_cast<unsigned>(float v)  { return cvRound(v); }
 template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
-//! @endcond
+//! @}
 }
 #endif // __cplusplus
-//! @} core_utils
 #endif //__OPENCV_HAL_H__
--- a/modules/hal/include/opencv2/hal/intrin.hpp
+++ b/modules/hal/include/opencv2/hal/intrin.hpp
@@ -48,6 +48,7 @@
 #include <cmath>
 #include <float.h>
 #include <stdlib.h>
+#include "opencv2/hal/defs.h"
 #define OPENCV_HAL_ADD(a, b) ((a) + (b))
 #define OPENCV_HAL_AND(a, b) ((a) & (b))
@@ -59,6 +60,10 @@
 // access from within opencv code more accessible
 namespace cv {
+//! @addtogroup hal_intrin
+//! @{
+//! @cond IGNORED
 template<typename _Tp> struct V_TypeTraits
 {
    typedef _Tp int_type;
@@ -82,6 +87,7 @@ template<> struct V_TypeTraits<uchar>
    typedef int sum_type;
    typedef ushort w_type;
+    typedef unsigned q_type;
    enum { delta = 128, shift = 8 };
@@ -99,6 +105,7 @@ template<> struct V_TypeTraits<schar>
    typedef int sum_type;
    typedef short w_type;
+    typedef int q_type;
    enum { delta = 128, shift = 8 };
@@ -265,8 +272,22 @@ template<> struct V_TypeTraits<double>
    }
 };
+template <typename T> struct V_SIMD128Traits
+{
+    enum { nlanes = 16 / sizeof(T) };
+};
+//! @endcond
+//! @}
 }
+#ifdef CV_DOXYGEN
+#   undef CV_SSE2
+#   undef CV_NEON
+#endif
 #if CV_SSE2
 #include "opencv2/hal/intrin_sse.hpp"
@@ -281,12 +302,19 @@ template<> struct V_TypeTraits<double>
 #endif
+//! @addtogroup hal_intrin
+//! @{
 #ifndef CV_SIMD128
+//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled)
 #define CV_SIMD128 0
 #endif
 #ifndef CV_SIMD128_64F
+//! Set to 1 if current intrinsics implementation supports 64-bit float vectors
 #define CV_SIMD128_64F 0
 #endif
+//! @}
 #endif
--- a/modules/hal/include/opencv2/hal/intrin_cpp.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
--- a/modules/hal/include/opencv2/hal/intrin_neon.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp
@@ -48,6 +48,8 @@
 namespace cv
 {
+//! @cond IGNORED
 #define CV_SIMD128 1
 struct v_uint8x16
@@ -278,14 +280,15 @@ void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
 }
 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
-OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
 OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
-OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
 OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
 OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
 OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
@@ -374,7 +377,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
-    int32x4x2_t cd = vtrnq_s32(c, d);
+    int32x4x2_t cd = vuzpq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
 }
@@ -497,6 +500,16 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
+#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
+inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec2(cast(intrin(a.val, b.val))); \
+}
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
 {
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
@@ -641,13 +654,13 @@ inline bool v_check_all(const v_float32x4& a)
 { return v_check_all(v_reinterpret_as_u32(a)); }
 inline bool v_check_any(const v_int8x16& a)
-{ return v_check_all(v_reinterpret_as_u8(a)); }
+{ return v_check_any(v_reinterpret_as_u8(a)); }
 inline bool v_check_any(const v_int16x8& a)
-{ return v_check_all(v_reinterpret_as_u16(a)); }
+{ return v_check_any(v_reinterpret_as_u16(a)); }
 inline bool v_check_any(const v_int32x4& a)
-{ return v_check_all(v_reinterpret_as_u32(a)); }
+{ return v_check_any(v_reinterpret_as_u32(a)); }
 inline bool v_check_any(const v_float32x4& a)
-{ return v_check_all(v_reinterpret_as_u32(a)); }
+{ return v_check_any(v_reinterpret_as_u32(a)); }
 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
@@ -678,6 +691,8 @@ OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
 {
@@ -840,6 +855,8 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
    return v_float32x4(vcvtq_f32_s32(a.val));
 }
+//! @endcond
 }
 #endif
--- a/modules/hal/include/opencv2/hal/intrin_sse.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp
@@ -51,6 +51,8 @@
 namespace cv
 {
+//! @cond IGNORED
 struct v_uint8x16
 {
    typedef uchar lane_type;
@@ -296,6 +298,11 @@ OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
+inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
+inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
 //////////////// PACK ///////////////
 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
 {
@@ -430,6 +437,17 @@ inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
    _mm_storel_epi64((__m128i*)ptr, r);
 }
+template<int n> inline
+v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
+    __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
+    __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
+    return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
+}
 template<int n> inline
 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
 {
@@ -460,7 +478,7 @@ void v_rshr_pack_store(short* ptr, const v_int32x4& a)
 {
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
-    _mm_storel_epi64((__m128i*)ptr, a1);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
 }
@@ -469,7 +487,7 @@ inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
 {
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
-    return v_uint32x4(_mm_unpacklo_epi64(v0, v1));
+    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
 }
 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
@@ -483,7 +501,7 @@ inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
 {
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
-    return v_int32x4(_mm_unpacklo_epi64(v0, v1));
+    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
 }
 inline void v_pack_store(int* ptr, const v_int64x2& a)
@@ -501,7 +519,7 @@ v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
-    return v_uint32x4(_mm_unpacklo_epi64(v0, v1));
+    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
 }
 template<int n> inline
@@ -534,7 +552,7 @@ v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
-    return v_int32x4(_mm_unpacklo_epi64(v0, v1));
+    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
 }
 template<int n> inline
@@ -630,8 +648,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
 {
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
-    c.val = _mm_unpacklo_epi32(v0, v1);
+    c.val = _mm_unpacklo_epi16(v0, v1);
-    d.val = _mm_unpackhi_epi32(v0, v1);
+    d.val = _mm_unpackhi_epi16(v0, v1);
 }
 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
@@ -639,8 +657,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
 {
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
-    c.val = _mm_unpacklo_epi32(v0, v1);
+    c.val = _mm_unpacklo_epi16(v0, v1);
-    d.val = _mm_unpackhi_epi32(v0, v1);
+    d.val = _mm_unpackhi_epi16(v0, v1);
 }
 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
@@ -869,6 +887,18 @@ inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
+inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
+{
+    return v_max(a, b) - v_min(a, b);
+}
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i d = _mm_sub_epi32(a.val, b.val);
+    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
+    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
+}
 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
 { \
@@ -1047,8 +1077,8 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
-OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
-OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
@@ -1257,7 +1287,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
-    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
@@ -1266,13 +1296,13 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
-    v2 = _mm_unpackhi_epi8(u0, u1); // b0 b2 b4 b6 ...
+    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
-    v3 = _mm_unpackhi_epi8(u2, u3); // b1 b3 b5 b7 ...
+    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
    a.val = _mm_unpacklo_epi8(v0, v1);
-    b.val = _mm_unpacklo_epi8(v2, v3);
+    b.val = _mm_unpackhi_epi8(v0, v1);
-    c.val = _mm_unpackhi_epi8(v0, v1);
+    c.val = _mm_unpacklo_epi8(v2, v3);
-    d.val = _mm_unpacklo_epi8(v2, v3);
+    d.val = _mm_unpackhi_epi8(v2, v3);
 }
 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
@@ -1560,6 +1590,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
    return v_float64x2(_mm_cvtps_pd(a.val));
 }
+//! @endcond
 }
 #endif
--- a/modules/hal/test/test_intrin.cpp
+++ b/modules/hal/test/test_intrin.cpp
--- a/modules/hal/test/test_intrin_utils.hpp
+++ b/modules/hal/test/test_intrin_utils.hpp
+#ifndef _TEST_UTILS_HPP_
+#define _TEST_UTILS_HPP_
+#include "opencv2/hal/intrin.hpp"
+#include "opencv2/ts.hpp"
+#include <ostream>
+#include <algorithm>
+template <typename R> struct Data;
+template <int N> struct initializer;
+template <> struct initializer<16>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
+    }
+};
+template <> struct initializer<8>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
+    }
+};
+template <> struct initializer<4>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return R(d[0], d[1], d[2], d[3]);
+    }
+};
+template <> struct initializer<2>
+{
+    template <typename R> static R init(const Data<R> & d)
+    {
+        return R(d[0], d[1]);
+    }
+};
+//==================================================================================================
+template <typename R> struct Data
+{
+    typedef typename R::lane_type LaneType;
+    Data()
+    {
+        for (int i = 0; i < R::nlanes; ++i)
+            d[i] = (LaneType)(i + 1);
+    }
+    Data(LaneType val)
+    {
+        fill(val);
+    }
+    Data(const R & r)
+    {
+        *this = r;
+    }
+    operator R ()
+    {
+        return initializer<R::nlanes>().init(*this);
+    }
+    Data<R> & operator=(const R & r)
+    {
+        v_store(d, r);
+        return *this;
+    }
+    template <typename T> Data<R> & operator*=(T m)
+    {
+        for (int i = 0; i < R::nlanes; ++i)
+            d[i] *= (LaneType)m;
+        return *this;
+    }
+    template <typename T> Data<R> & operator+=(T m)
+    {
+        for (int i = 0; i < R::nlanes; ++i)
+            d[i] += (LaneType)m;
+        return *this;
+    }
+    void fill(LaneType val)
+    {
+        for (int i = 0; i < R::nlanes; ++i)
+            d[i] = val;
+    }
+    void reverse()
+    {
+        for (int i = 0; i < R::nlanes / 2; ++i)
+            std::swap(d[i], d[R::nlanes - i - 1]);
+    }
+    const LaneType & operator[](int i) const
+    {
+        CV_Assert(i >= 0 && i < R::nlanes);
+        return d[i];
+    }
+    LaneType & operator[](int i)
+    {
+        CV_Assert(i >= 0 && i < R::nlanes);
+        return d[i];
+    }
+    const LaneType * mid() const
+    {
+        return d + R::nlanes / 2;
+    }
+    LaneType * mid()
+    {
+        return d + R::nlanes / 2;
+    }
+    bool operator==(const Data<R> & other) const
+    {
+        for (int i = 0; i < R::nlanes; ++i)
+            if (d[i] != other.d[i])
+                return false;
+        return true;
+    }
+    void clear()
+    {
+        fill(0);
+    }
+    bool isZero() const
+    {
+        return isValue(0);
+    }
+    bool isValue(uchar val) const
+    {
+        for (int i = 0; i < R::nlanes; ++i)
+            if (d[i] != val)
+                return false;
+        return true;
+    }
+    LaneType d[R::nlanes];
+};
+template<typename R> struct AlignedData
+{
+    Data<R> CV_DECL_ALIGNED(16) a; // aligned
+    char dummy;
+    Data<R> u; // unaligned
+};
+template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R> & d)
+{
+    out << "{ ";
+    for (int i = 0; i < R::nlanes; ++i)
+    {
+        // out << std::hex << +V_TypeTraits<typename R::lane_type>::reinterpret_int(d.d[i]);
+        out << +d.d[i];
+        if (i + 1 < R::nlanes)
+            out << ", ";
+    }
+    out << " }";
+    return out;
+}
+//==================================================================================================
+template <typename R> struct RegTrait;
+template <> struct RegTrait<cv::v_uint8x16> {
+    typedef cv::v_uint16x8 w_reg;
+    typedef cv::v_uint32x4 q_reg;
+    typedef cv::v_uint8x16 u_reg;
+    static cv::v_uint8x16 zero() { return cv::v_setzero_u8(); }
+    static cv::v_uint8x16 all(uchar val) { return cv::v_setall_u8(val); }
+};
+template <> struct RegTrait<cv::v_int8x16> {
+    typedef cv::v_int16x8 w_reg;
+    typedef cv::v_int32x4 q_reg;
+    typedef cv::v_uint8x16 u_reg;
+    static cv::v_int8x16 zero() { return cv::v_setzero_s8(); }
+    static cv::v_int8x16 all(schar val) { return cv::v_setall_s8(val); }
+};
+template <> struct RegTrait<cv::v_uint16x8> {
+    typedef cv::v_uint32x4 w_reg;
+    typedef cv::v_int16x8 int_reg;
+    typedef cv::v_uint16x8 u_reg;
+    static cv::v_uint16x8 zero() { return cv::v_setzero_u16(); }
+    static cv::v_uint16x8 all(ushort val) { return cv::v_setall_u16(val); }
+};
+template <> struct RegTrait<cv::v_int16x8> {
+    typedef cv::v_int32x4 w_reg;
+    typedef cv::v_uint16x8 u_reg;
+    static cv::v_int16x8 zero() { return cv::v_setzero_s16(); }
+    static cv::v_int16x8 all(short val) { return cv::v_setall_s16(val); }
+};
+template <> struct RegTrait<cv::v_uint32x4> {
+    typedef cv::v_uint64x2 w_reg;
+    typedef cv::v_int32x4 int_reg;
+    typedef cv::v_uint32x4 u_reg;
+    static cv::v_uint32x4 zero() { return cv::v_setzero_u32(); }
+    static cv::v_uint32x4 all(unsigned val) { return cv::v_setall_u32(val); }
+};
+template <> struct RegTrait<cv::v_int32x4> {
+    typedef cv::v_int64x2 w_reg;
+    typedef cv::v_uint32x4 u_reg;
+    static cv::v_int32x4 zero() { return cv::v_setzero_s32(); }
+    static cv::v_int32x4 all(int val) { return cv::v_setall_s32(val); }
+};
+template <> struct RegTrait<cv::v_uint64x2> {
+    static cv::v_uint64x2 zero() { return cv::v_setzero_u64(); }
+    static cv::v_uint64x2 all(uint64 val) { return cv::v_setall_u64(val); }
+};
+template <> struct RegTrait<cv::v_int64x2> {
+    static cv::v_int64x2 zero() { return cv::v_setzero_s64(); }
+    static cv::v_int64x2 all(int64 val) { return cv::v_setall_s64(val); }
+};
+template <> struct RegTrait<cv::v_float32x4> {
+    typedef cv::v_int32x4 int_reg;
+    typedef cv::v_float32x4 u_reg;
+    static cv::v_float32x4 zero() { return cv::v_setzero_f32(); }
+    static cv::v_float32x4 all(float val) { return cv::v_setall_f32(val); }
+};
+#if CV_SIMD128_64F
+template <> struct RegTrait<cv::v_float64x2> {
+    typedef cv::v_int32x4 int_reg;
+    typedef cv::v_float64x2 u_reg;
+    static cv::v_float64x2 zero() { return cv::v_setzero_f64(); }
+    static cv::v_float64x2 all(double val) { return cv::v_setall_f64(val); }
+};
+#endif
+#endif
--- a/modules/hal/test/test_main.cpp
+++ b/modules/hal/test/test_main.cpp
+#include "opencv2/ts.hpp"
+CV_TEST_MAIN("cv")
--- a/modules/hal/test/test_precomp.hpp
+++ b/modules/hal/test/test_precomp.hpp
+#ifndef __OPENCV_HAL_TEST_PRECOMP_HPP__
+#define __OPENCV_HAL_TEST_PRECOMP_HPP__
+#include <iostream>
+#include <limits>
+#include "opencv2/ts.hpp"
+#include "opencv2/hal.hpp"
+#include "opencv2/hal/defs.h"
+#include "opencv2/hal/intrin.hpp"
+#endif