Commit a275489f authored by Maksim Shabunin's avatar Maksim Shabunin

HAL universal intrinsics tests and documentation

parent 190d00ea
......@@ -49,10 +49,21 @@
/**
@defgroup hal Hardware Acceleration Layer
@{
@defgroup hal_intrin Universal intrinsics
@{
@defgroup hal_intrin_impl Private implementation helpers
@}
@defgroup hal_utils Platform-dependent utils
@}
*/
namespace cv { namespace hal {
//! @addtogroup hal
//! @{
namespace Error {
enum
......@@ -93,6 +104,8 @@ void sqrt(const double* src, double* dst, int len);
void invSqrt(const float* src, float* dst, int len);
void invSqrt(const double* src, double* dst, int len);
//! @}
}} //cv::hal
#endif //__OPENCV_HAL_HPP__
......@@ -45,6 +45,9 @@
#ifndef __OPENCV_DEF_H__
#define __OPENCV_DEF_H__
//! @addtogroup hal_utils
//! @{
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif
......@@ -335,9 +338,6 @@ Cv64suf;
# include "tegra_round.hpp"
#endif
//! @addtogroup core_utils
//! @{
#if CV_VFP
// 1. general scheme
#define ARM_ROUND(_value, _asm_string) \
......@@ -567,15 +567,19 @@ CV_INLINE int cvIsInf( float value )
return (ieee754.u & 0x7fffffff) == 0x7f800000;
}
//! @}
#include <algorithm>
namespace cv
{
//! @addtogroup hal_utils
//! @{
/////////////// saturate_cast (used in image & signal processing) ///////////////////
/**
Template function for accurate conversion from one primitive type to another.
/** @brief Template function for accurate conversion from one primitive type to another.
The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
and others. They perform an efficient and accurate conversion from one primitive type to another
......@@ -618,8 +622,6 @@ template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp(
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(uint64 v) { return _Tp(v); }
//! @cond IGNORED
template<> inline uchar saturate_cast<uchar>(schar v) { return (uchar)std::max((int)v, 0); }
template<> inline uchar saturate_cast<uchar>(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
template<> inline uchar saturate_cast<uchar>(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
......@@ -664,12 +666,10 @@ template<> inline int saturate_cast<int>(double v) { return cvRound(v)
template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); }
template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
//! @endcond
//! @}
}
#endif // __cplusplus
//! @} core_utils
#endif //__OPENCV_HAL_H__
......@@ -48,6 +48,7 @@
#include <cmath>
#include <float.h>
#include <stdlib.h>
#include "opencv2/hal/defs.h"
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
......@@ -59,6 +60,10 @@
// access from within opencv code more accessible
namespace cv {
//! @addtogroup hal_intrin
//! @{
//! @cond IGNORED
template<typename _Tp> struct V_TypeTraits
{
typedef _Tp int_type;
......@@ -82,6 +87,7 @@ template<> struct V_TypeTraits<uchar>
typedef int sum_type;
typedef ushort w_type;
typedef unsigned q_type;
enum { delta = 128, shift = 8 };
......@@ -99,6 +105,7 @@ template<> struct V_TypeTraits<schar>
typedef int sum_type;
typedef short w_type;
typedef int q_type;
enum { delta = 128, shift = 8 };
......@@ -265,8 +272,22 @@ template<> struct V_TypeTraits<double>
}
};
template <typename T> struct V_SIMD128Traits
{
enum { nlanes = 16 / sizeof(T) };
};
//! @endcond
//! @}
}
#ifdef CV_DOXYGEN
# undef CV_SSE2
# undef CV_NEON
#endif
#if CV_SSE2
#include "opencv2/hal/intrin_sse.hpp"
......@@ -281,12 +302,19 @@ template<> struct V_TypeTraits<double>
#endif
//! @addtogroup hal_intrin
//! @{
#ifndef CV_SIMD128
//! Set to 1 if the current compiler supports vector extensions (NEON or SSE is enabled)
#define CV_SIMD128 0
#endif
#ifndef CV_SIMD128_64F
//! Set to 1 if the current intrinsics implementation supports 64-bit float vectors
#define CV_SIMD128_64F 0
#endif
//! @}
#endif
......@@ -45,25 +45,233 @@
#ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
#define __OPENCV_HAL_INTRIN_CPP_HPP__
#include <limits>
#include <cstring>
namespace cv
{
/** @addtogroup hal_intrin
"Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
containing packed values of different types. In case when there is no SIMD extension available
during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
expected although it could be slower.
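For example, an element-wise addition of two arrays can be written once and compiled for any of the
supported platforms (a minimal sketch; the helper name is illustrative and the length `n` is
assumed to be a multiple of 8):
@code{.cpp}
void add16s(const short* a, const short* b, short* dst, int n)
{
    for( int i = 0; i < n; i += 8 )
    {
        v_int16x8 va = v_load(a + i);
        v_int16x8 vb = v_load(b + i);
        v_store(dst + i, va + vb); // element-wise saturating addition
    }
}
@endcode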
### Types
There are several types representing a 128-bit register as a vector of packed values; each type is
implemented as a structure based on a single SIMD register.
- cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
- cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
- cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
- cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
- cv::v_float32x4: four 32-bit floating point values (signed) - float
- cv::v_float64x2: two 64-bit floating point values (signed) - double
@note
cv::v_float64x2 is not implemented in the NEON variant; if you want to use this type, don't forget
to check the CV_SIMD128_64F preprocessor definition:
@code
#if CV_SIMD128_64F
//...
#endif
@endcode
### Load and store operations
These operations allow you to set register contents explicitly, load them from a memory block, and
store register contents back to memory (see the sketch after this list).
- Constructors:
@ref v_reg::v_reg(const _Tp *ptr) "from memory",
@ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
- Other create methods:
@ref v_setall_s8, @ref v_setall_u8, ...,
@ref v_setzero_u8, @ref v_setzero_s8, ...
- Memory operations:
@ref v_load, @ref v_load_aligned, @ref v_load_halves,
@ref v_store, @ref v_store_aligned,
@ref v_store_high, @ref v_store_low
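A minimal sketch of the different initialization options (values are illustrative):
@code{.cpp}
float buf[4] = { 1.f, 2.f, 3.f, 4.f };
v_float32x4 a = v_load(buf);             // from memory
v_float32x4 b(0.5f, 1.5f, 2.5f, 3.5f);   // from four scalar values
v_float32x4 c = v_setall_f32(7.f);       // all lanes set to 7
v_float32x4 d = v_setzero_f32();         // all lanes set to 0
v_store(buf, d);                         // buf is now {0, 0, 0, 0}
@endcode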
### Value reordering
These operations reorder or recombine elements of one or multiple vectors (see the example after this list).
- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
- Extract: @ref v_extract
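For example, expanding 8-bit values to 16-bit ones (a minimal sketch):
@code{.cpp}
v_uint8x16 a = v_setall_u8(200);
v_uint16x8 lo, hi;
v_expand(a, lo, hi); // every lane of lo and hi now holds 200 as a 16-bit value
@endcode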
### Arithmetic, bitwise and comparison operations
Element-wise binary and unary operations (a comparison example follows this list).
- Arithmetic:
@ref operator+(const v_reg &a, const v_reg &b) "+",
@ref operator-(const v_reg &a, const v_reg &b) "-",
@ref operator*(const v_reg &a, const v_reg &b) "*",
@ref operator/(const v_reg &a, const v_reg &b) "/",
@ref v_mul_expand
- Non-saturating arithmetic: @ref v_add_wrap, @ref v_sub_wrap
- Bitwise shifts:
@ref operator<<(const v_reg &a, int s) "<<",
@ref operator>>(const v_reg &a, int s) ">>",
@ref v_shl, @ref v_shr
- Bitwise logic:
@ref operator&(const v_reg &a, const v_reg &b) "&",
@ref operator|(const v_reg &a, const v_reg &b) "|",
@ref operator^(const v_reg &a, const v_reg &b) "^",
@ref operator~(const v_reg &a) "~"
- Comparison:
@ref operator>(const v_reg &a, const v_reg &b) ">",
@ref operator>=(const v_reg &a, const v_reg &b) ">=",
@ref operator<(const v_reg &a, const v_reg &b) "<",
@ref operator<=(const v_reg &a, const v_reg &b) "<=",
@ref operator==(const v_reg &a, const v_reg &b) "==",
@ref operator!=(const v_reg &a, const v_reg &b) "!="
- min/max: @ref v_min, @ref v_max
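Comparison operators fill a lane with ones when the condition holds (a minimal sketch):
@code{.cpp}
v_int32x4 a(1, 2, 3, 4), b(3, 3, 3, 3);
v_int32x4 m = a < b; // {-1, -1, 0, 0}: all bits set in lanes where a[i] < b[i]
@endcode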
### Reduce and mask
Most of these operations return only one value (a usage example follows the list).
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
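Masks produced by comparisons combine naturally with cv::v_select (a minimal sketch):
@code{.cpp}
v_int32x4 a(1, 5, 3, 7), b(4, 4, 4, 4);
v_int32x4 r = v_select(a > b, a, b); // per-lane maximum: {4, 5, 4, 7}
@endcode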
### Other math
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
- Absolute values: @ref v_abs, @ref v_absdiff
### Conversions
Different type conversions and casts (an example follows the list):
- Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
- To float: @ref v_cvt_f32, @ref v_cvt_f64
- Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
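A minimal sketch:
@code{.cpp}
v_float32x4 f(1.4f, 1.6f, -1.4f, -1.6f);
v_int32x4 i = v_round(f);     // {1, 2, -1, -2}
v_float32x4 g = v_cvt_f32(i); // back to float: {1.f, 2.f, -1.f, -2.f}
@endcode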
### Matrix operations
In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
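For example, multiplying a vector by the identity matrix (a minimal sketch with illustrative values):
@code{.cpp}
v_float32x4 v(1.f, 2.f, 3.f, 4.f);
v_float32x4 m0(1.f, 0.f, 0.f, 0.f), m1(0.f, 1.f, 0.f, 0.f),
            m2(0.f, 0.f, 1.f, 0.f), m3(0.f, 0.f, 0.f, 1.f);
v_float32x4 r = v_matmul(v, m0, m1, m2, m3); // r == v
@endcode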
### Usability
Most operations are implemented only for some subset of the available types; the following matrices
show the applicability of different operations to the types.
Regular integers:
| Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
|-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
|load, store | x | x | x | x | x | x |
|interleave | x | x | x | x | x | x |
|expand | x | x | x | x | x | x |
|expand_q | x | x | | | | |
|add, sub | x | x | x | x | x | x |
|add_wrap, sub_wrap | x | x | x | x | | |
|mul | | | x | x | x | x |
|mul_expand | | | x | x | x | |
|compare | x | x | x | x | x | x |
|shift | | | x | x | x | x |
|dotprod | | | | x | | |
|logical | x | x | x | x | x | x |
|min, max | x | x | x | x | x | x |
|absdiff | x | x | x | x | x | x |
|reduce | | | | | x | x |
|mask | x | x | x | x | x | x |
|pack | x | x | x | x | x | x |
|pack_u | x | | x | | | |
|unpack | x | x | x | x | x | x |
|extract | x | x | x | x | x | x |
|cvt_flt32 | | | | | | x |
|cvt_flt64 | | | | | | x |
|transpose4x4 | | | | | x | x |
Big integers:
| Operations\\Types | uint 64x2 | int 64x2 |
|-------------------|:-:|:-:|
|load, store | x | x |
|add, sub | x | x |
|shift | x | x |
|logical | x | x |
|extract | x | x |
Floating point:
| Operations\\Types | float 32x4 | float 64x2 |
|-------------------|:-:|:-:|
|load, store | x | x |
|interleave | x | |
|add, sub | x | x |
|mul | x | x |
|div | x | x |
|compare | x | x |
|min, max | x | x |
|absdiff | x | x |
|reduce | x | |
|mask | x | x |
|unpack | x | x |
|cvt_flt32 | | x |
|cvt_flt64 | x | |
|sqrt, abs | x | x |
|float math | x | x |
|transpose4x4 | x | |
@{ */
template<typename _Tp, int n> struct v_reg
{
//! @cond IGNORED
typedef _Tp lane_type;
typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
enum { nlanes = n };
//! @endcond
/** @brief Constructor
Initializes register with data from memory
@param ptr pointer to memory block with data for register */
explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
/** @brief Constructor
Initializes register with two 64-bit values */
v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
/** @brief Constructor
Initializes register with four 32-bit values */
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
/** @brief Constructor
Initializes register with eight 16-bit values */
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
_Tp s4, _Tp s5, _Tp s6, _Tp s7)
{
s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
}
/** @brief Constructor
Initializes register with sixteen 8-bit values */
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
_Tp s4, _Tp s5, _Tp s6, _Tp s7,
_Tp s8, _Tp s9, _Tp s10, _Tp s11,
......@@ -75,15 +283,31 @@ template<typename _Tp, int n> struct v_reg
s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
}
/** @brief Default constructor
Does not initialize anything*/
v_reg() {}
/** @brief Copy constructor */
v_reg(const v_reg<_Tp, n> & r)
{
for( int i = 0; i < n; i++ )
s[i] = r.s[i];
}
/** @brief Access first value
Returns value of the first lane according to register type, for example:
@code{.cpp}
v_int32x4 r(1, 2, 3, 4);
int v = r.get0(); // returns 1
v_uint64x2 r(1, 2);
uint64_t v = r.get0(); // returns 1
@endcode
*/
_Tp get0() const { return s[0]; }
//! @cond IGNORED
_Tp get(const int i) const { return s[i]; }
_Tp get0() const { return s[0]; }
v_reg<_Tp, n> high() const
{
v_reg<_Tp, n> c;
......@@ -116,13 +340,37 @@ template<typename _Tp, int n> struct v_reg
{
size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
v_reg<_Tp2, n2> c;
memcpy(&c.s[0], &s[0], bytes);
std::memcpy(&c.s[0], &s[0], bytes);
return c;
}
_Tp s[n];
//! @endcond
};
/** @brief Sixteen 8-bit unsigned integer values */
typedef v_reg<uchar, 16> v_uint8x16;
/** @brief Sixteen 8-bit signed integer values */
typedef v_reg<schar, 16> v_int8x16;
/** @brief Eight 16-bit unsigned integer values */
typedef v_reg<ushort, 8> v_uint16x8;
/** @brief Eight 16-bit signed integer values */
typedef v_reg<short, 8> v_int16x8;
/** @brief Four 32-bit unsigned integer values */
typedef v_reg<unsigned, 4> v_uint32x4;
/** @brief Four 32-bit signed integer values */
typedef v_reg<int, 4> v_int32x4;
/** @brief Four 32-bit floating point values (single precision) */
typedef v_reg<float, 4> v_float32x4;
/** @brief Two 64-bit floating point values (double precision) */
typedef v_reg<double, 2> v_float64x2;
/** @brief Two 64-bit unsigned integer values */
typedef v_reg<uint64, 2> v_uint64x2;
/** @brief Two 64-bit signed integer values */
typedef v_reg<int64, 2> v_int64x2;
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> \
operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
......@@ -140,11 +388,28 @@ template<typename _Tp, int n> inline v_reg<_Tp, n>& \
return a; \
}
/** @brief Add values
For all types. */
OPENCV_HAL_IMPL_BIN_OP(+)
/** @brief Subtract values
For all types. */
OPENCV_HAL_IMPL_BIN_OP(-)
/** @brief Multiply values
For 16- and 32-bit integer types and floating types. */
OPENCV_HAL_IMPL_BIN_OP(*)
/** @brief Divide values
For floating types only. */
OPENCV_HAL_IMPL_BIN_OP(/)
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
......@@ -166,10 +431,24 @@ template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
return a; \
}
/** @brief Bitwise AND
Only for integer types. */
OPENCV_HAL_IMPL_BIT_OP(&)
/** @brief Bitwise OR
Only for integer types. */
OPENCV_HAL_IMPL_BIT_OP(|)
/** @brief Bitwise XOR
Only for integer types.*/
OPENCV_HAL_IMPL_BIT_OP(^)
/** @brief Bitwise NOT
Only for integer types.*/
template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
{
v_reg<_Tp, n> c;
......@@ -178,6 +457,8 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp,
return c;
}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
......@@ -187,27 +468,59 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
return c; \
}
/** @brief Square root of elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
//! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
//! @endcond
/** @brief Absolute value of elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
typename V_TypeTraits<_Tp>::abs_type)
/** @brief Round elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
/** @brief Floor elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
/** @brief Ceil elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
/** @brief Truncate elements
Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = cfunc(a.s[i], b.s[i]); \
return c; \
} \
template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
_Tp c = a.s[0]; \
for( int i = 1; i < n; i++ ) \
......@@ -215,9 +528,49 @@ template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
return c; \
}
OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min)
OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max)
/** @brief Choose min values for each pair
Scheme:
@code
{A1 A2 ...}
{B1 B2 ...}
--------------
{min(A1,B1) min(A2,B2) ...}
@endcode
For all types except 64-bit integer. */
OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
/** @brief Choose max values for each pair
Scheme:
@code
{A1 A2 ...}
{B1 B2 ...}
--------------
{max(A1,B1) max(A2,B2) ...}
@endcode
For all types except 64-bit integer. */
OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
/** @brief Find one min value
Scheme:
@code
{A1 A2 A3 ...} => min(A1,A2,A3,...)
@endcode
For 32-bit integer and 32-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
/** @brief Find one max value
Scheme:
@code
{A1 A2 A3 ...} => max(A1,A2,A3,...)
@endcode
For 32-bit integer and 32-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
//! @cond IGNORED
template<typename _Tp, int n>
inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
......@@ -228,8 +581,10 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
maxval.s[i] = std::max(a.s[i], b.s[i]);
}
}
//! @endcond
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
......@@ -241,13 +596,38 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
return c; \
}
/** @brief Less-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<)
/** @brief Greater-than comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>)
/** @brief Less-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<=)
/** @brief Greater-than or equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>=)
/** @brief Equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(==)
/** @brief Not equal comparison
For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(!=)
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
......@@ -259,10 +639,73 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
return c; \
}
/** @brief Add values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
/** @brief Subtract values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type)
//! @cond IGNORED
template<typename T> inline T _absdiff(T a, T b)
{
return a > b ? a - b : b - a;
}
//! @endcond
/** @brief Absolute difference
Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
Example:
@code{.cpp}
v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
@endcode
For 8-, 16-, 32-bit integer source types. */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
{
typedef typename V_TypeTraits<_Tp>::abs_type rtype;
v_reg<rtype, n> c;
const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0;
for( int i = 0; i < n; i++ )
{
rtype ua = a.s[i] ^ mask;
rtype ub = b.s[i] ^ mask;
c.s[i] = _absdiff(ua, ub);
}
return c;
}
/** @overload
For 32-bit floating point values */
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{
v_float32x4 c;
for( int i = 0; i < c.nlanes; i++ )
c.s[i] = _absdiff(a.s[i], b.s[i]);
return c;
}
/** @overload
For 64-bit floating point values */
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{
v_float64x2 c;
for( int i = 0; i < c.nlanes; i++ )
c.s[i] = _absdiff(a.s[i], b.s[i]);
return c;
}
/** @brief Inverse square root
Returns \f$ 1/\sqrt{a} \f$
For floating point types only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
{
......@@ -272,6 +715,10 @@ inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
return c;
}
/** @brief Magnitude
Returns \f$ \sqrt{a^2 + b^2} \f$
For floating point types only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
......@@ -281,7 +728,10 @@ inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
return c;
}
/** @brief Square of the magnitude
Returns \f$ a^2 + b^2 \f$
For floating point types only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
......@@ -291,6 +741,10 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
return c;
}
/** @brief Multiply and add
Returns \f$ a*b + c \f$
For floating point types only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
......@@ -301,6 +755,18 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
return d;
}
/** @brief Dot product of elements
Multiply values in two registers and sum adjacent result pairs.
Scheme:
@code
{A1 A2 ...} // 16-bit
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2 ...} // 32-bit
@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
......@@ -311,6 +777,25 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
return c;
}
/** @brief Multiply and expand
Multiply values of two registers and store the results in two registers with a wider pack type.
Scheme:
@code
{A B C D} // 32-bit
x {E F G H} // 32-bit
---------------
{AE BF} // 64-bit
{CG DH} // 64-bit
@endcode
Example:
@code{.cpp}
v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
v_uint64x2 c, d; // results
v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
@endcode
Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
*/
template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
......@@ -318,11 +803,12 @@ template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, c
typedef typename V_TypeTraits<_Tp>::w_type w_type;
for( int i = 0; i < (n/2); i++ )
{
c.s[i] = (w_type)a.s[i]*b.s[i]*2;
c.s[i] = (w_type)a.s[i]*b.s[i];
d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
}
}
//! @cond IGNORED
template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
{
......@@ -332,7 +818,10 @@ template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
}
}
//! @endcond
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
......@@ -342,9 +831,23 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg
return c; \
}
/** @brief Bitwise shift left
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(<<)
/** @brief Bitwise shift right
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(>>)
/** @brief Sum packed values
Scheme:
@code
{A1 A2 A3 ...} => sum{A1,A2,A3,...}
@endcode
For 32-bit integer and 32-bit floating point types.*/
template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
{
typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
......@@ -353,6 +856,15 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
return c;
}
/** @brief Get negative values mask
The returned value is a bit mask with bits set to 1 at positions corresponding to the indexes of negative packed values.
Example:
@code{.cpp}
v_int32x4 r; // set to {-1, -1, 1, 1}
int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
{
int mask = 0;
......@@ -361,6 +873,10 @@ template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
return mask;
}
/** @brief Check if all packed values are less than zero
Unsigned values will be cast to signed: `uchar 254 => char -2`.
For all types except 64-bit. */
template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
......@@ -369,6 +885,10 @@ template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
return true;
}
/** @brief Check if any of packed values is less than zero
Unsigned values will be cast to signed: `uchar 254 => char -2`.
For all types except 64-bit. */
template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
......@@ -377,15 +897,36 @@ template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
return false;
}
/** @brief Bitwise select
The return value is built by combining values of _a_ and _b_ using the following scheme:
If the i-th bit in _mask_ is 1
select i-th bit from _a_
else
select i-th bit from _b_ */
template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
typedef V_TypeTraits<_Tp> Traits;
typedef typename Traits::int_type int_type;
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? b.s[i] : a.s[i];
{
int_type m = Traits::reinterpret_int(mask.s[i]);
c.s[i] = Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m)
| (Traits::reinterpret_int(b.s[i]) & ~m));
}
return c;
}
/** @brief Expand values to the wider pack type
Copy contents of the register to two registers with a 2x wider pack type.
Scheme:
@code
int32x4 int64x2 int64x2
{A B C D} ==> {A B} , {C D}
@endcode */
template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
......@@ -397,6 +938,7 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
}
}
//! @cond IGNORED
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
v_reinterpret_as_int(const v_reg<_Tp, n>& a)
{
......@@ -414,7 +956,19 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type
c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
return c;
}
//! @endcond
/** @brief Interleave two vectors
Scheme:
@code
{A1 A2 A3 A4}
{B1 B2 B3 B4}
---------------
{A1 B1 A2 B2} and {A3 B3 A4 B4}
@endcode
For all types except 64-bit.
*/
template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
{
......@@ -431,50 +985,102 @@ template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const
}
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_load(const _Tp* ptr)
/** @brief Load register contents from memory
@param ptr pointer to memory block with data
@return register object
@note The returned type is deduced from the passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
{
return v_reg<_Tp, n>(ptr);
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr)
/** @brief Load register contents from memory (aligned)
Similar to cv::v_load, but the source memory block should be aligned (to a 16-byte boundary).
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
{
return v_reg<_Tp, n>(ptr);
return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
}
template<typename _Tp, int n> inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr)
/** @brief Load register contents from two memory blocks
@param loptr memory block containing data for first half (0..n/2)
@param hiptr memory block containing data for second half (n/2..n)
@code{.cpp}
int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
v_int32x4 r = v_load_halves(lo, hi);
@endcode
*/
template<typename _Tp>
inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n/2; i++ )
v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
for( int i = 0; i < c.nlanes/2; i++ )
{
c.s[i] = loptr[i];
c.s[i+n/2] = hiptr[i];
c.s[i+c.nlanes/2] = hiptr[i];
}
return c;
}
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr)
/** @brief Load register contents from memory with double expand
Same as cv::v_load, but the result pack type will be 2x wider than the memory type.
@code{.cpp}
short buf[4] = {1, 2, 3, 4}; // type is int16
v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-, 16-, 32-bit integer source types. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
v_load_expand(const _Tp* ptr)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, n> c;
for( int i = 0; i < n; i++ )
v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
for( int i = 0; i < c.nlanes; i++ )
{
c.s[i] = ptr[i];
}
return c;
}
template<typename _Tp, int n> inline v_reg<typename
V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr)
/** @brief Load register contents from memory with quad expand
Same as cv::v_load_expand, but the result type is 4 times wider than the source.
@code{.cpp}
char buf[4] = {1, 2, 3, 4}; // type is int8
v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
@endcode
For 8-bit integer source types. */
template<typename _Tp>
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
v_load_expand_q(const _Tp* ptr)
{
typedef typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type w_type;
v_reg<w_type, n> c;
for( int i = 0; i < n; i++ )
typedef typename V_TypeTraits<_Tp>::q_type q_type;
v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
for( int i = 0; i < c.nlanes; i++ )
{
c.s[i] = ptr[i];
}
return c;
}
/** @brief Load and deinterleave (3 channels)
Load data from memory, deinterleave, and store to 3 registers.
Scheme:
@code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
{
......@@ -487,6 +1093,14 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
}
}
/** @brief Load and deinterleave (4 channels)
Load data from memory, deinterleave, and store to 4 registers.
Scheme:
@code
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
......@@ -502,6 +1116,14 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
}
}
/** @brief Interleave and store (3 channels)
Interleave and store data from 3 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
......@@ -515,6 +1137,14 @@ inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
}
}
/** @brief Interleave and store (4 channels)
Interleave and store data from 4 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
const v_reg<_Tp, n>& d)
......@@ -529,6 +1159,14 @@ template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_
}
}
/** @brief Store data to memory
Store register contents to memory.
Scheme:
@code
REG {A B C D} ==> MEM {A B C D}
@endcode
Pointer can be unaligned. */
template<typename _Tp, int n>
inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
{
......@@ -536,6 +1174,13 @@ inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr[i] = a.s[i];
}
/** @brief Store data to memory (lower half)
Store lower half of register contents to memory.
Scheme:
@code
REG {A B C D} ==> MEM {A B}
@endcode */
template<typename _Tp, int n>
inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
{
......@@ -543,6 +1188,13 @@ inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr[i] = a.s[i];
}
/** @brief Store data to memory (higher half)
Store higher half of register contents to memory.
Scheme:
@code
REG {A B C D} ==> MEM {C D}
@endcode */
template<typename _Tp, int n>
inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
{
......@@ -550,6 +1202,14 @@ inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr[i] = a.s[i+(n/2)];
}
/** @brief Store data to memory (aligned)
Store register contents to memory.
Scheme:
@code
REG {A B C D} ==> MEM {A B C D}
@endcode
Pointer __should__ be aligned to a 16-byte boundary. */
template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
{
......@@ -557,6 +1217,16 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
ptr[i] = a.s[i];
}
/** @brief Combine vector from first elements of two vectors
Scheme:
@code
{A1 A2 A3 A4}
{B1 B2 B3 B4}
---------------
{A1 A2 B1 B2}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
......@@ -569,6 +1239,16 @@ inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
return c;
}
/** @brief Combine vector from last elements of two vectors
Scheme:
@code
{A1 A2 A3 A4}
{B1 B2 B3 B4}
---------------
{A3 A4 B3 B4}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
......@@ -581,6 +1261,12 @@ inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
return c;
}
/** @brief Combine two vectors from lower and higher parts of two other vectors
@code{.cpp}
low = cv::v_combine_low(a, b);
high = cv::v_combine_high(a, b);
@endcode */
template<typename _Tp, int n>
inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
......@@ -594,18 +1280,41 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
}
}
/** @brief Vector extract
Scheme:
@code
{A1 A2 A3 A4}
{B1 B2 B3 B4}
========================
shift = 1 {A2 A3 A4 B1}
shift = 2 {A3 A4 B1 B2}
shift = 3 {A4 B1 B2 B3}
@endcode
Restriction: 0 <= shift < nlanes
Usage:
@code
v_int32x4 a, b, c;
c = v_extract<2>(a, b);
@endcode
For integer types only. */
template<int s, typename _Tp, int n>
inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> r;
const int shift = n - s;
int i = 0;
for (; i < s; ++i)
r.s[i] = a.s[i+n-s];
for (; i < shift; ++i)
r.s[i] = a.s[i+s];
for (; i < n; ++i)
r.s[i] = b.s[i-s];
r.s[i] = b.s[i-shift];
return r;
}
/** @brief Round
Rounds each value. Input type is float vector ==> output type is int vector.*/
template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
{
v_reg<int, n> c;
......@@ -614,6 +1323,9 @@ template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
return c;
}
/** @brief Floor
Floor each value. Input type is float vector ==> output type is int vector.*/
template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
{
v_reg<int, n> c;
......@@ -622,6 +1334,9 @@ template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
return c;
}
/** @brief Ceil
Ceil each value. Input type is float vector ==> output type is int vector.*/
template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
{
v_reg<int, n> c;
......@@ -630,6 +1345,9 @@ template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
return c;
}
/** @brief Trunc
Truncate each value. Input type is float vector ==> output type is int vector.*/
template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
{
v_reg<int, n> c;
......@@ -638,6 +1356,7 @@ template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
return c;
}
/** @overload */
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
{
v_reg<int, n*2> c;
......@@ -649,6 +1368,7 @@ template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
return c;
}
/** @overload */
template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
{
v_reg<int, n> c;
......@@ -660,6 +1380,7 @@ template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
return c;
}
/** @overload */
template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
{
v_reg<int, n> c;
......@@ -671,6 +1392,7 @@ template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
return c;
}
/** @overload */
template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
{
v_reg<int, n> c;
......@@ -682,6 +1404,9 @@ template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
return c;
}
/** @brief Convert to float
Supported input type is cv::v_int32x4. */
template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
{
v_reg<float, n> c;
......@@ -690,6 +1415,9 @@ template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
return c;
}
/** @brief Convert to double
Supported input type is cv::v_int32x4. */
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
{
v_reg<double, n> c;
......@@ -698,6 +1426,9 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
return c;
}
/** @brief Convert to double
Supported input type is cv::v_float32x4. */
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
{
v_reg<double, n> c;
......@@ -706,6 +1437,21 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
return c;
}
/** @brief Transpose 4x4 matrix
Scheme:
@code
a0 {A1 A2 A3 A4}
a1 {B1 B2 B3 B4}
a2 {C1 C2 C3 C4}
a3 {D1 D2 D3 D4}
===============
b0 {A1 B1 C1 D1}
b1 {A2 B2 C2 D2}
b2 {A3 B3 C3 D3}
b3 {A4 B4 C4 D4}
@endcode
*/
template<typename _Tp>
inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
......@@ -718,41 +1464,105 @@ inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
}
typedef v_reg<uchar, 16> v_uint8x16;
typedef v_reg<schar, 16> v_int8x16;
typedef v_reg<ushort, 8> v_uint16x8;
typedef v_reg<short, 8> v_int16x8;
typedef v_reg<unsigned, 4> v_uint32x4;
typedef v_reg<int, 4> v_int32x4;
typedef v_reg<float, 4> v_float32x4;
typedef v_reg<float, 8> v_float32x8;
typedef v_reg<double, 2> v_float64x2;
typedef v_reg<uint64, 2> v_uint64x2;
typedef v_reg<int64, 2> v_int64x2;
#define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \
inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
//! @name Init with zero
//! @{
//! @brief Create a new vector with all elements set to zero
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
//! @name Init with value
//! @{
//! @brief Create a new vector with all elements set to a specific value
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
template<typename _Tp0, int n0> inline _Tpvec \
v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(a); }
OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16)
OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_C_INIT(v_int32x4, int, s32)
OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32)
OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64)
OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_C_INIT(v_uint64x2, int64, s64)
#define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \
{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
//! @name Reinterpret
//! @{
//! @brief Convert vector to different type without modifying underlying data.
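//!
//! A minimal sketch:
//! @code{.cpp}
//! v_float32x4 f = v_setall_f32(1.f);
//! v_uint32x4 u = v_reinterpret_as_u32(f); // each lane holds 0x3f800000, the IEEE 754 bits of 1.0f
//! @endcode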
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return a << n; } \
{ return a << n; }
//! @name Left shift
//! @{
//! @brief Shift left
OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return a >> n; } \
{ return a >> n; }
//! @name Right shift
//! @{
//! @brief Shift right
OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ \
_Tpvec c; \
......@@ -761,15 +1571,20 @@ template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
return c; \
}
OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort)
OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short)
OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int)
OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64)
OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64)
#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
//! @name Rounding shift
//! @{
//! @brief Rounding shift right
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \
inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpnvec c; \
......@@ -779,7 +1594,30 @@ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
} \
return c; \
} \
}
//! @name Pack
//! @{
//! @brief Pack values from two vectors to one
//!
//! The return vector type has twice as many elements as the input vector types. The variant with
//! the _u_ suffix also converts to the corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
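//!
//! A minimal sketch:
//! @code{.cpp}
//! v_int32x4 a = v_setall_s32(70000), b = v_setall_s32(-70000);
//! v_int16x8 c = v_pack(a, b); // saturated: four lanes of 32767 followed by four lanes of -32768
//! @endcode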
OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack)
OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpnvec c; \
......@@ -789,27 +1627,98 @@ template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpve
c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
} \
return c; \
} \
}
//! @name Pack with rounding shift
//! @{
//! @brief Pack values from two vectors to one with rounding shift
//!
//! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to a
//! narrower type and returned in the result vector. The variant with the _u_ suffix converts to an unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
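//!
//! A minimal sketch:
//! @code{.cpp}
//! v_int32x4 a = v_setall_s32(135), b = v_setall_s32(-135);
//! v_int16x8 c = v_rshr_pack<4>(a, b); // (135 + 8) >> 4 = 8; (-135 + 8) >> 4 = -8
//! @endcode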
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
} \
}
//! @name Pack and store
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be stored into memory with a saturating conversion to a narrower type.
//! The variant with the _u_ suffix converts to the corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
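//!
//! A minimal sketch:
//! @code{.cpp}
//! v_int32x4 a = v_setall_s32(70000);
//! short buf[4];
//! v_pack_store(buf, a); // buf = {32767, 32767, 32767, 32767}
//! @endcode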
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
//! @}
//! @brief Helper macro
//! @ingroup hal_intrin_impl
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
}
OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack)
//! @name Pack and store with rounding shift
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be shifted _n_ bits right with rounding, converted to a narrower type and stored into
//! memory. The variant with the _u_ suffix converts to an unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
//! @}
/** @brief Matrix multiplication
Scheme:
@code
{A0 A1 A2 A3} |V0|
{B0 B1 B2 B3} |V1|
{C0 C1 C2 C3} |V2|
{D0 D1 D2 D3} x |V3|
====================
{R0 R1 R2 R3}, where:
R0 = A0V0 + A1V1 + A2V2 + A3V3,
R1 = B0V0 + B1V1 + B2V2 + B3V3
...
@endcode
*/
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
......@@ -820,6 +1729,8 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
}
//! @}
}
#endif
......@@ -48,6 +48,8 @@
namespace cv
{
//! @cond IGNORED
#define CV_SIMD128 1
struct v_uint8x16
......@@ -278,14 +280,15 @@ void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
}
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
......@@ -374,7 +377,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
int32x4x2_t cd = vtrnq_s32(c, d);
int32x4x2_t cd = vuzpq_s32(c, d);
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
......@@ -497,6 +500,16 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec2(cast(intrin(a.val, b.val))); \
}
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
......@@ -641,13 +654,13 @@ inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
{ return v_check_any(v_reinterpret_as_u32(a)); }
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
......@@ -678,6 +691,8 @@ OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
......@@ -840,6 +855,8 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
return v_float32x4(vcvtq_f32_s32(a.val));
}
//! @endcond
}
#endif
......@@ -51,6 +51,8 @@
namespace cv
{
//! @cond IGNORED
struct v_uint8x16
{
typedef uchar lane_type;
......@@ -296,6 +298,11 @@ OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
//////////////// PACK ///////////////
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
......@@ -430,6 +437,17 @@ inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
_mm_storel_epi64((__m128i*)ptr, r);
}
template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
__m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
__m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
__m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
__m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
}
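// Scalar sketch of the bias trick above (illustration only; rshr_pack_u_ref
// is a hypothetical helper). SSE2 has no unsigned-saturating pack from 32 to
// 16 bits, so the value is shifted into the signed 16-bit range, saturated by
// _mm_packs_epi32, and shifted back.
static inline ushort rshr_pack_u_ref(int v, int n)
{
    v = (v + (1 << (n - 1))) >> n;                     // rounding right shift
    int s = v - 32768;                                 // bias into int16 range
    s = s < -32768 ? -32768 : (s > 32767 ? 32767 : s); // packs_epi32 saturation
    return (ushort)(s + 32768);                        // remove the bias
}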
template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
......@@ -460,7 +478,7 @@ void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
__m128i delta = _mm_set1_epi32(1 << (n-1));
__m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, a1);
_mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}
......@@ -469,7 +487,7 @@ inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
__m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
__m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
return v_uint32x4(_mm_unpacklo_epi64(v0, v1));
return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}
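// Lane walk-through of the fix above (illustration only), writing a = [a0 a1],
// b = [b0 b1] as 64-bit lanes and x.lo/x.hi for their 32-bit halves:
//   v0 = unpacklo_epi32(a, b) = [a0.lo b0.lo a0.hi b0.hi]
//   v1 = unpackhi_epi32(a, b) = [a1.lo b1.lo a1.hi b1.hi]
//   unpacklo_epi32(v0, v1)    = [a0.lo a1.lo b0.lo b1.lo]   <- desired order
// (the old unpacklo_epi64 gave [a0.lo b0.lo a1.lo b1.lo]). A hypothetical
// scalar reference:
static inline void pack64to32_ref(const uint64 a[2], const uint64 b[2], unsigned dst[4])
{
    dst[0] = (unsigned)a[0]; dst[1] = (unsigned)a[1]; // low words of a's lanes,
    dst[2] = (unsigned)b[0]; dst[3] = (unsigned)b[1]; // then b's lanes
}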
inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
......@@ -483,7 +501,7 @@ inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
__m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
__m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
return v_int32x4(_mm_unpacklo_epi64(v0, v1));
return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}
inline void v_pack_store(int* ptr, const v_int64x2& a)
......@@ -501,7 +519,7 @@ v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
__m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
__m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0.lo b0.lo a0.hi b0.hi (after shift)
__m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1.lo b1.lo a1.hi b1.hi (after shift)
return v_uint32x4(_mm_unpacklo_epi64(v0, v1));
return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}
template<int n> inline
......@@ -534,7 +552,7 @@ v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
__m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
__m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0.lo b0.lo a0.hi b0.hi (after shift)
__m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1.lo b1.lo a1.hi b1.hi (after shift)
return v_int32x4(_mm_unpacklo_epi64(v0, v1));
return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}
template<int n> inline
......@@ -630,8 +648,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
{
__m128i v0 = _mm_mullo_epi16(a.val, b.val);
__m128i v1 = _mm_mulhi_epi16(a.val, b.val);
c.val = _mm_unpacklo_epi32(v0, v1);
d.val = _mm_unpackhi_epi32(v0, v1);
c.val = _mm_unpacklo_epi16(v0, v1);
d.val = _mm_unpackhi_epi16(v0, v1);
}
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
......@@ -639,8 +657,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
{
__m128i v0 = _mm_mullo_epi16(a.val, b.val);
__m128i v1 = _mm_mulhi_epu16(a.val, b.val);
c.val = _mm_unpacklo_epi32(v0, v1);
d.val = _mm_unpackhi_epi32(v0, v1);
c.val = _mm_unpacklo_epi16(v0, v1);
d.val = _mm_unpackhi_epi16(v0, v1);
}
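// Scalar reference for v_mul_expand (illustration only; mul_expand_ref_u16 is
// a hypothetical helper). _mm_mullo_epi16/_mm_mulhi_epi16 hold the low and
// high 16 bits of each product in matching lanes, so the 16-bit unpacks above
// stitch full 32-bit products back together; the previous 32-bit unpacks
// paired half-words from different products.
static inline void mul_expand_ref_u16(const ushort a[8], const ushort b[8],
                                      unsigned c[4], unsigned d[4])
{
    for (int i = 0; i < 4; ++i)
    {
        c[i] = (unsigned)a[i] * b[i];         // full products of the low half
        d[i] = (unsigned)a[i + 4] * b[i + 4]; // full products of the high half
    }
}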
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
......@@ -869,6 +887,18 @@ inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{
return v_max(a, b) - v_min(a, b);
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
__m128i d = _mm_sub_epi32(a.val, b.val);
__m128i m = _mm_cmpgt_epi32(b.val, a.val);
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
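// Scalar sketch of the branch-free |a - b| above (illustration only): m is
// all-ones exactly where b > a, and (d ^ m) - m is the two's-complement
// conditional negation of d.
static inline unsigned absdiff_ref_s32(int a, int b)
{
    unsigned d = (unsigned)a - (unsigned)b; // wraps, like the vector subtraction
    unsigned m = b > a ? ~0u : 0u;          // the _mm_cmpgt_epi32 mask
    return (d ^ m) - m;                     // negates d exactly when b > a
}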
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
......@@ -1047,8 +1077,8 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
......@@ -1257,7 +1287,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
__m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
__m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
__m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
__m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ...
__m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
......@@ -1266,13 +1296,13 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
v2 = _mm_unpackhi_epi8(u0, u1); // b0 b2 b4 b6 ...
v3 = _mm_unpackhi_epi8(u2, u3); // b1 b3 b5 b7 ...
v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
a.val = _mm_unpacklo_epi8(v0, v1);
b.val = _mm_unpacklo_epi8(v2, v3);
c.val = _mm_unpackhi_epi8(v0, v1);
d.val = _mm_unpacklo_epi8(v2, v3);
b.val = _mm_unpackhi_epi8(v0, v1);
c.val = _mm_unpacklo_epi8(v2, v3);
d.val = _mm_unpackhi_epi8(v2, v3);
}
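// Scalar reference for the 4-channel deinterleave above (illustration only;
// the unpack cascade realizes exactly this gather, and the fix routes the b
// and c planes to the correct unpack halves):
static inline void deinterleave4_ref(const uchar* ptr,
                                     uchar a[16], uchar b[16],
                                     uchar c[16], uchar d[16])
{
    for (int i = 0; i < 16; ++i)
    {
        a[i] = ptr[4*i + 0]; b[i] = ptr[4*i + 1];
        c[i] = ptr[4*i + 2]; d[i] = ptr[4*i + 3];
    }
}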
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
......@@ -1560,6 +1590,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
return v_float64x2(_mm_cvtps_pd(a.val));
}
//! @endcond
}
#endif
#include "test_intrin_utils.hpp"
#include <climits>
using namespace cv;
template<typename R> struct TheTest
{
typedef typename R::lane_type LaneType;
TheTest & test_loadstore()
{
AlignedData<R> data;
AlignedData<R> out;
// check if addresses are aligned and unaligned respectively
EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
// check some initialization methods
R r1 = data.a;
R r2 = v_load(data.u.d);
R r3 = v_load_aligned(data.a.d);
R r4(r2);
EXPECT_EQ(data.a[0], r1.get0());
EXPECT_EQ(data.u[0], r2.get0());
EXPECT_EQ(data.a[0], r3.get0());
EXPECT_EQ(data.u[0], r4.get0());
// check some store methods
out.u.clear();
out.a.clear();
v_store(out.u.d, r1);
v_store_aligned(out.a.d, r2);
EXPECT_EQ(data.a, out.a);
EXPECT_EQ(data.u, out.u);
// check more store methods
Data<R> d, res(0);
R r5 = d;
v_store_high(res.mid(), r5);
v_store_low(res.d, r5);
EXPECT_EQ(d, res);
// check halves load correctness
res.clear();
R r6 = v_load_halves(d.d, d.mid());
v_store(res.d, r6);
EXPECT_EQ(d, res);
// zero, all
Data<R> resZ = RegTrait<R>::zero();
Data<R> resV = RegTrait<R>::all(8);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ((LaneType)0, resZ[i]);
EXPECT_EQ((LaneType)8, resV[i]);
}
// reinterpret_as
v_uint8x16 vu8 = v_reinterpret_as_u8(r1); out.a.clear(); v_store((uchar*)out.a.d, vu8); EXPECT_EQ(data.a, out.a);
v_int8x16 vs8 = v_reinterpret_as_s8(r1); out.a.clear(); v_store((schar*)out.a.d, vs8); EXPECT_EQ(data.a, out.a);
v_uint16x8 vu16 = v_reinterpret_as_u16(r1); out.a.clear(); v_store((ushort*)out.a.d, vu16); EXPECT_EQ(data.a, out.a);
v_int16x8 vs16 = v_reinterpret_as_s16(r1); out.a.clear(); v_store((short*)out.a.d, vs16); EXPECT_EQ(data.a, out.a);
v_uint32x4 vu32 = v_reinterpret_as_u32(r1); out.a.clear(); v_store((unsigned*)out.a.d, vu32); EXPECT_EQ(data.a, out.a);
v_int32x4 vs32 = v_reinterpret_as_s32(r1); out.a.clear(); v_store((int*)out.a.d, vs32); EXPECT_EQ(data.a, out.a);
v_uint64x2 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
v_int64x2 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
v_float32x4 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
#if CV_SIMD128_64F
v_float64x2 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
#endif
return *this;
}
TheTest & test_interleave()
{
Data<R> data1, data2, data3, data4;
data2 += 20;
data3 += 40;
data4 += 60;
R a = data1, b = data2, c = data3;
R d = data1, e = data2, f = data3, g = data4;
LaneType buf3[R::nlanes * 3];
LaneType buf4[R::nlanes * 4];
v_store_interleave(buf3, a, b, c);
v_store_interleave(buf4, d, e, f, g);
Data<R> z(0);
a = b = c = d = e = f = g = z;
v_load_deinterleave(buf3, a, b, c);
v_load_deinterleave(buf4, d, e, f, g);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(data1, Data<R>(a));
EXPECT_EQ(data2, Data<R>(b));
EXPECT_EQ(data3, Data<R>(c));
EXPECT_EQ(data1, Data<R>(d));
EXPECT_EQ(data2, Data<R>(e));
EXPECT_EQ(data3, Data<R>(f));
EXPECT_EQ(data4, Data<R>(g));
}
return *this;
}
// v_expand and v_load_expand
TheTest & test_expand()
{
typedef typename RegTrait<R>::w_reg Rx2;
Data<R> dataA;
R a = dataA;
Data<Rx2> resB = v_load_expand(dataA.d);
Rx2 c, d;
v_expand(a, c, d);
Data<Rx2> resC = c, resD = d;
const int n = Rx2::nlanes;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(dataA[i], resB[i]);
EXPECT_EQ(dataA[i], resC[i]);
EXPECT_EQ(dataA[i + n], resD[i]);
}
return *this;
}
TheTest & test_expand_q()
{
typedef typename RegTrait<R>::q_reg Rx4;
Data<R> data;
Data<Rx4> out = v_load_expand_q(data.d);
const int n = Rx4::nlanes;
for (int i = 0; i < n; ++i)
EXPECT_EQ(data[i], out[i]);
return *this;
}
TheTest & test_addsub()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = a + b, resD = a - b;
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] - dataB[i]), resD[i]);
}
return *this;
}
TheTest & test_addsub_wrap()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = v_add_wrap(a, b),
resD = v_sub_wrap(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
}
return *this;
}
TheTest & test_mul()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = a * b;
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
}
return *this;
}
TheTest & test_div()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = a / b;
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] / dataB[i], resC[i]);
}
return *this;
}
TheTest & test_mul_expand()
{
typedef typename RegTrait<R>::w_reg Rx2;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Rx2 c, d;
v_mul_expand(a, b, c, d);
Data<Rx2> resC = c, resD = d;
const int n = R::nlanes / 2;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]);
EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
}
return *this;
}
template <int s>
TheTest & test_shift()
{
Data<R> dataA;
R a = dataA;
Data<R> resB = a << s, resC = v_shl<s>(a), resD = a >> s, resE = v_shr<s>(a);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] << s, resB[i]);
EXPECT_EQ(dataA[i] << s, resC[i]);
EXPECT_EQ(dataA[i] >> s, resD[i]);
EXPECT_EQ(dataA[i] >> s, resE[i]);
}
return *this;
}
TheTest & test_cmp()
{
Data<R> dataA, dataB;
dataB.reverse();
dataB += 1;
R a = dataA, b = dataB;
Data<R> resC = (a == b);
Data<R> resD = (a != b);
Data<R> resE = (a > b);
Data<R> resF = (a >= b);
Data<R> resG = (a < b);
Data<R> resH = (a <= b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
EXPECT_EQ(dataA[i] > dataB[i], resE[i] != 0);
EXPECT_EQ(dataA[i] >= dataB[i], resF[i] != 0);
EXPECT_EQ(dataA[i] < dataB[i], resG[i] != 0);
EXPECT_EQ(dataA[i] <= dataB[i], resH[i] != 0);
}
return *this;
}
TheTest & test_dot_prod()
{
typedef typename RegTrait<R>::w_reg Rx2;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Data<Rx2> res = v_dotprod(a, b);
const int n = R::nlanes / 2;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]);
}
return *this;
}
TheTest & test_logic()
{
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
Data<R> resC = a & b, resD = a | b, resE = a ^ b, resF = ~a;
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] & dataB[i], resC[i]);
EXPECT_EQ(dataA[i] | dataB[i], resD[i]);
EXPECT_EQ(dataA[i] ^ dataB[i], resE[i]);
EXPECT_EQ((LaneType)~dataA[i], resF[i]);
}
return *this;
}
TheTest & test_sqrt_abs()
{
Data<R> dataA, dataD;
dataD *= -1.0;
R a = dataA, d = dataD;
Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_FLOAT_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
EXPECT_FLOAT_EQ((float)1/std::sqrt(dataA[i]), (float)resC[i]);
EXPECT_FLOAT_EQ((float)std::abs(dataA[i]), (float)resE[i]);
}
return *this;
}
TheTest & test_min_max()
{
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = v_min(a, b), resD = v_max(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]);
EXPECT_EQ(std::max(dataA[i], dataB[i]), resD[i]);
}
return *this;
}
TheTest & test_absdiff()
{
typedef typename RegTrait<R>::u_reg Ru;
typedef typename Ru::lane_type u_type;
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
dataA[0] = (LaneType)-1;
dataB[0] = 1;
dataA[1] = 2;
dataB[1] = (LaneType)-2;
R a = dataA, b = dataB;
Data<Ru> resC = v_absdiff(a, b);
const u_type mask = std::numeric_limits<LaneType>::is_signed ? (u_type)((u_type)1 << (sizeof(u_type)*8 - 1)) : 0;
for (int i = 0; i < Ru::nlanes; ++i)
{
u_type uA = dataA[i] ^ mask;
u_type uB = dataB[i] ^ mask;
EXPECT_EQ(uA > uB ? uA - uB : uB - uA, resC[i]);
}
return *this;
}
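// Why the XOR mask above works (a sketch, not used by the test): flipping the
// sign bit maps signed values onto unsigned ones order-preservingly, e.g. for
// schar: -128 -> 0, -1 -> 127, 0 -> 128, 127 -> 255, so unsigned differences
// of the mapped values equal the true absolute differences.
static uchar absdiff_via_bias_s8(schar a, schar b)
{
    uchar ua = (uchar)((uchar)a ^ 0x80); // order-preserving map to [0, 255]
    uchar ub = (uchar)((uchar)b ^ 0x80);
    return ua > ub ? (uchar)(ua - ub) : (uchar)(ub - ua);
}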
TheTest & test_float_absdiff()
{
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
dataA[0] = -1;
dataB[0] = 1;
dataA[1] = 2;
dataB[1] = -2;
R a = dataA, b = dataB;
Data<R> resC = v_absdiff(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i] > dataB[i] ? dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]);
}
return *this;
}
TheTest & test_reduce()
{
Data<R> dataA;
R a = dataA;
EXPECT_EQ((LaneType)1, v_reduce_min(a));
EXPECT_EQ((LaneType)R::nlanes, v_reduce_max(a));
EXPECT_EQ((LaneType)(R::nlanes*(R::nlanes + 1)/2), v_reduce_sum(a)); // sum of 1..nlanes
return *this;
}
TheTest & test_mask()
{
Data<R> dataA, dataB, dataC, dataD(1), dataE(2);
dataA[1] *= (LaneType)-1;
dataC *= (LaneType)-1;
R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
int m = v_signmask(a);
EXPECT_EQ(2, m);
EXPECT_EQ(false, v_check_all(a));
EXPECT_EQ(false, v_check_all(b));
EXPECT_EQ(true, v_check_all(c));
EXPECT_EQ(true, v_check_any(a));
EXPECT_EQ(false, v_check_any(b));
EXPECT_EQ(true, v_check_any(c));
typedef V_TypeTraits<LaneType> Traits;
typedef typename Traits::int_type int_type;
R f = v_select(b, d, e);
Data<R> resF = f;
for (int i = 0; i < R::nlanes; ++i)
{
int_type m2 = Traits::reinterpret_int(dataB[i]);
EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2)
| (Traits::reinterpret_int(dataE[i]) & ~m2),
Traits::reinterpret_int(resF[i]));
}
return *this;
}
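// Scalar model of v_select checked above (a sketch): a per-bit blend taking
// bits of a where the mask is set and bits of b elsewhere.
static int select_ref(int mask, int a, int b)
{
    return (a & mask) | (b & ~mask);
}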
template <int s>
TheTest & test_pack()
{
typedef typename RegTrait<R>::w_reg Rx2;
typedef typename Rx2::lane_type w_type;
Data<Rx2> dataA, dataB;
dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
dataB *= 10;
Rx2 a = dataA, b = dataB;
Data<R> resC = v_pack(a, b);
Data<R> resD = v_rshr_pack<s>(a, b);
Data<R> resE(0);
v_pack_store(resE.d, b);
Data<R> resF(0);
v_rshr_pack_store<s>(resF.d, b);
const int n = Rx2::nlanes;
const w_type add = (w_type)1 << (s - 1);
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
EXPECT_EQ((LaneType)0, resE[i + n]);
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
EXPECT_EQ((LaneType)0, resF[i + n]);
}
return *this;
}
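// Scalar model of the pack operations checked above (a sketch; rshr_pack_ref
// is a hypothetical helper): rounding right shift, then saturating narrow.
// For example, with s == 2: (10 + 2) >> 2 == 3.
template <typename N, typename W>
static N rshr_pack_ref(W v, int s)
{
    return saturate_cast<N>((v + ((W)1 << (s - 1))) >> s);
}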
template <int s>
TheTest & test_pack_u()
{
typedef typename RegTrait<R>::w_reg Rx2;
typedef typename RegTrait<Rx2>::int_reg Ri2;
typedef typename Ri2::lane_type w_type;
Data<Ri2> dataA, dataB;
dataA += -10;
dataB *= 10;
Ri2 a = dataA, b = dataB;
Data<R> resC = v_pack_u(a, b);
Data<R> resD = v_rshr_pack_u<s>(a, b);
Data<R> resE(0);
v_pack_u_store(resE.d, b);
Data<R> resF(0);
v_rshr_pack_u_store<s>(resF.d, b);
const int n = Ri2::nlanes;
const w_type add = (w_type)1 << (s - 1);
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(saturate_cast<LaneType>(dataA[i]), resC[i]);
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resC[i + n]);
EXPECT_EQ(saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resD[i + n]);
EXPECT_EQ(saturate_cast<LaneType>(dataB[i]), resE[i]);
EXPECT_EQ((LaneType)0, resE[i + n]);
EXPECT_EQ(saturate_cast<LaneType>((dataB[i] + add) >> s), resF[i]);
EXPECT_EQ((LaneType)0, resF[i + n]);
}
return *this;
}
TheTest & test_unpack()
{
Data<R> dataA, dataB;
dataB *= 10;
R a = dataA, b = dataB;
R c, d, e, f, lo, hi;
v_zip(a, b, c, d);
v_recombine(a, b, e, f);
lo = v_combine_low(a, b);
hi = v_combine_high(a, b);
Data<R> resC = c, resD = d, resE = e, resF = f, resLo = lo, resHi = hi;
const int n = R::nlanes/2;
for (int i = 0; i < n; ++i)
{
EXPECT_EQ(dataA[i], resC[i*2]);
EXPECT_EQ(dataB[i], resC[i*2+1]);
EXPECT_EQ(dataA[i+n], resD[i*2]);
EXPECT_EQ(dataB[i+n], resD[i*2+1]);
EXPECT_EQ(dataA[i], resE[i]);
EXPECT_EQ(dataB[i], resE[i+n]);
EXPECT_EQ(dataA[i+n], resF[i]);
EXPECT_EQ(dataB[i+n], resF[i+n]);
EXPECT_EQ(dataA[i], resLo[i]);
EXPECT_EQ(dataB[i], resLo[i+n]);
EXPECT_EQ(dataA[i+n], resHi[i]);
EXPECT_EQ(dataB[i+n], resHi[i+n]);
}
return *this;
}
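// A concrete 4-lane illustration of the interleave helpers exercised above
// (a sketch with hypothetical values, not used by the test):
static void zip_example()
{
    cv::v_int32x4 a(1, 2, 3, 4), b(10, 20, 30, 40), c, d;
    cv::v_zip(a, b, c, d);
    // c == {1, 10, 2, 20}, d == {3, 30, 4, 40};
    // v_combine_low(a, b) == {1, 2, 10, 20}, v_combine_high(a, b) == {3, 4, 30, 40}.
}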
template<int s>
TheTest & test_extract()
{
Data<R> dataA, dataB;
dataB *= 10;
R a = dataA, b = dataB;
Data<R> resC = v_extract<s>(a, b);
for (int i = 0; i < R::nlanes; ++i)
{
if (i + s >= R::nlanes)
EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]);
else
EXPECT_EQ(dataA[i + s], resC[i]);
}
return *this;
}
TheTest & test_float_math()
{
typedef typename RegTrait<R>::int_reg Ri;
Data<R> data1, data2, data3;
data1 *= 1.1;
data2 += 10;
R a1 = data1, a2 = data2, a3 = data3;
Data<Ri> resB = v_round(a1),
resC = v_trunc(a1),
resD = v_floor(a1),
resE = v_ceil(a1);
Data<R> resF = v_magnitude(a1, a2),
resG = v_sqr_magnitude(a1, a2),
resH = v_muladd(a1, a2, a3);
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(cvRound(data1[i]), resB[i]);
EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]);
EXPECT_EQ(cvFloor(data1[i]), resD[i]);
EXPECT_EQ(cvCeil(data1[i]), resE[i]);
EXPECT_DOUBLE_EQ(std::sqrt(data1[i]*data1[i] + data2[i]*data2[i]), resF[i]);
EXPECT_DOUBLE_EQ(data1[i]*data1[i] + data2[i]*data2[i], resG[i]);
EXPECT_DOUBLE_EQ(data1[i]*data2[i] + data3[i], resH[i]);
}
return *this;
}
TheTest & test_float_cvt32()
{
typedef v_float32x4 Rt;
Data<R> dataA;
dataA *= 1.1;
R a = dataA;
Rt b = v_cvt_f32(a);
Data<Rt> resB = b;
int n = std::min<int>(Rt::nlanes, R::nlanes);
for (int i = 0; i < n; ++i)
{
EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
}
return *this;
}
TheTest & test_float_cvt64()
{
#if CV_SIMD128_64F
typedef v_float64x2 Rt;
Data<R> dataA;
dataA *= 1.1;
R a = dataA;
Rt b = v_cvt_f64(a);
Data<Rt> resB = b;
int n = std::min<int>(Rt::nlanes, R::nlanes);
for (int i = 0; i < n; ++i)
{
EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
}
#endif
return *this;
}
TheTest & test_matmul()
{
Data<R> dataV, dataA, dataB, dataC, dataD;
dataB.reverse();
dataC += 2;
dataD *= 0.3;
R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD;
Data<R> res = v_matmul(v, a, b, c, d);
for (int i = 0; i < R::nlanes; ++i)
{
LaneType val = dataV[0] * dataA[i]
+ dataV[1] * dataB[i]
+ dataV[2] * dataC[i]
+ dataV[3] * dataD[i];
EXPECT_DOUBLE_EQ(val, res[i]);
}
return *this;
}
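// Scalar model of v_matmul verified above (a sketch): the result is the
// linear combination of the four "column" registers weighted by v's lanes.
static void matmul_ref(const float v[4], const float m[4][4], float dst[4])
{
    for (int i = 0; i < 4; ++i)
        dst[i] = v[0]*m[0][i] + v[1]*m[1][i] + v[2]*m[2][i] + v[3]*m[3][i];
}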
TheTest & test_transpose()
{
Data<R> dataA, dataB, dataC, dataD;
dataB *= 5;
dataC *= 10;
dataD *= 15;
R a = dataA, b = dataB, c = dataC, d = dataD;
R e, f, g, h;
v_transpose4x4(a, b, c, d,
e, f, g, h);
Data<R> res[4] = {e, f, g, h};
for (int i = 0; i < R::nlanes; ++i)
{
EXPECT_EQ(dataA[i], res[i][0]);
EXPECT_EQ(dataB[i], res[i][1]);
EXPECT_EQ(dataC[i], res[i][2]);
EXPECT_EQ(dataD[i], res[i][3]);
}
return *this;
}
};
//============= 8-bit integer =====================================================================
TEST(hal_intrin, uint8x16) {
TheTest<v_uint8x16>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_expand_q()
.test_addsub()
.test_addsub_wrap()
.test_cmp()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
;
}
TEST(hal_intrin, int8x16) {
TheTest<v_int8x16>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_expand_q()
.test_addsub()
.test_addsub_wrap()
.test_cmp()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
;
}
//============= 16-bit integer =====================================================================
TEST(hal_intrin, uint16x8) {
TheTest<v_uint16x8>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_addsub_wrap()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
;
}
TEST(hal_intrin, int16x8) {
TheTest<v_int16x8>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_addsub_wrap()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_dot_prod()
.test_logic()
.test_min_max()
.test_absdiff()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
;
}
//============= 32-bit integer =====================================================================
TEST(hal_intrin, uint32x4) {
TheTest<v_uint32x4>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_mul()
.test_mul_expand()
.test_cmp()
.test_shift<1>()
.test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_transpose()
;
}
TEST(hal_intrin, int32x4) {
TheTest<v_int32x4>()
.test_loadstore()
.test_interleave()
.test_expand()
.test_addsub()
.test_mul()
.test_cmp()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_min_max()
.test_absdiff()
.test_reduce()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_float_cvt32()
.test_float_cvt64()
.test_transpose()
;
}
//============= 64-bit integer =====================================================================
TEST(hal_intrin, uint64x2) {
TheTest<v_uint64x2>()
.test_loadstore()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
;
}
TEST(hal_intrin, int64x2) {
TheTest<v_int64x2>()
.test_loadstore()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
;
}
//============= Floating point =====================================================================
TEST(hal_intrin, float32x4) {
TheTest<v_float32x4>()
.test_loadstore()
.test_interleave()
.test_addsub()
.test_mul()
.test_div()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_reduce()
.test_mask()
.test_unpack()
.test_float_math()
.test_float_cvt64()
.test_matmul()
.test_transpose()
;
}
#if CV_SIMD128_64F
TEST(hal_intrin, float64x2) {
TheTest<v_float64x2>()
.test_loadstore()
.test_addsub()
.test_mul()
.test_div()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_mask()
.test_unpack()
.test_float_math()
.test_float_cvt32()
;
}
#endif
#ifndef _TEST_UTILS_HPP_
#define _TEST_UTILS_HPP_
#include "opencv2/hal/intrin.hpp"
#include "opencv2/ts.hpp"
#include <ostream>
#include <algorithm>
template <typename R> struct Data;
template <int N> struct initializer;
template <> struct initializer<16>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
}
};
template <> struct initializer<8>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
}
};
template <> struct initializer<4>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1], d[2], d[3]);
}
};
template <> struct initializer<2>
{
template <typename R> static R init(const Data<R> & d)
{
return R(d[0], d[1]);
}
};
//==================================================================================================
template <typename R> struct Data
{
typedef typename R::lane_type LaneType;
Data()
{
for (int i = 0; i < R::nlanes; ++i)
d[i] = (LaneType)(i + 1);
}
Data(LaneType val)
{
fill(val);
}
Data(const R & r)
{
*this = r;
}
operator R ()
{
return initializer<R::nlanes>().init(*this);
}
Data<R> & operator=(const R & r)
{
v_store(d, r);
return *this;
}
template <typename T> Data<R> & operator*=(T m)
{
for (int i = 0; i < R::nlanes; ++i)
d[i] *= (LaneType)m;
return *this;
}
template <typename T> Data<R> & operator+=(T m)
{
for (int i = 0; i < R::nlanes; ++i)
d[i] += (LaneType)m;
return *this;
}
void fill(LaneType val)
{
for (int i = 0; i < R::nlanes; ++i)
d[i] = val;
}
void reverse()
{
for (int i = 0; i < R::nlanes / 2; ++i)
std::swap(d[i], d[R::nlanes - i - 1]);
}
const LaneType & operator[](int i) const
{
CV_Assert(i >= 0 && i < R::nlanes);
return d[i];
}
LaneType & operator[](int i)
{
CV_Assert(i >= 0 && i < R::nlanes);
return d[i];
}
const LaneType * mid() const
{
return d + R::nlanes / 2;
}
LaneType * mid()
{
return d + R::nlanes / 2;
}
bool operator==(const Data<R> & other) const
{
for (int i = 0; i < R::nlanes; ++i)
if (d[i] != other.d[i])
return false;
return true;
}
void clear()
{
fill(0);
}
bool isZero() const
{
return isValue(0);
}
bool isValue(uchar val) const
{
for (int i = 0; i < R::nlanes; ++i)
if (d[i] != val)
return false;
return true;
}
LaneType d[R::nlanes];
};
template<typename R> struct AlignedData
{
Data<R> CV_DECL_ALIGNED(16) a; // aligned
char dummy;                    // pushes the next member off 16-byte alignment
Data<R> u; // unaligned
};
template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R> & d)
{
out << "{ ";
for (int i = 0; i < R::nlanes; ++i)
{
// out << std::hex << +V_TypeTraits<typename R::lane_type>::reinterpret_int(d.d[i]);
out << +d.d[i];
if (i + 1 < R::nlanes)
out << ", ";
}
out << " }";
return out;
}
//==================================================================================================
template <typename R> struct RegTrait;
template <> struct RegTrait<cv::v_uint8x16> {
typedef cv::v_uint16x8 w_reg;
typedef cv::v_uint32x4 q_reg;
typedef cv::v_uint8x16 u_reg;
static cv::v_uint8x16 zero() { return cv::v_setzero_u8(); }
static cv::v_uint8x16 all(uchar val) { return cv::v_setall_u8(val); }
};
template <> struct RegTrait<cv::v_int8x16> {
typedef cv::v_int16x8 w_reg;
typedef cv::v_int32x4 q_reg;
typedef cv::v_uint8x16 u_reg;
static cv::v_int8x16 zero() { return cv::v_setzero_s8(); }
static cv::v_int8x16 all(schar val) { return cv::v_setall_s8(val); }
};
template <> struct RegTrait<cv::v_uint16x8> {
typedef cv::v_uint32x4 w_reg;
typedef cv::v_int16x8 int_reg;
typedef cv::v_uint16x8 u_reg;
static cv::v_uint16x8 zero() { return cv::v_setzero_u16(); }
static cv::v_uint16x8 all(ushort val) { return cv::v_setall_u16(val); }
};
template <> struct RegTrait<cv::v_int16x8> {
typedef cv::v_int32x4 w_reg;
typedef cv::v_uint16x8 u_reg;
static cv::v_int16x8 zero() { return cv::v_setzero_s16(); }
static cv::v_int16x8 all(short val) { return cv::v_setall_s16(val); }
};
template <> struct RegTrait<cv::v_uint32x4> {
typedef cv::v_uint64x2 w_reg;
typedef cv::v_int32x4 int_reg;
typedef cv::v_uint32x4 u_reg;
static cv::v_uint32x4 zero() { return cv::v_setzero_u32(); }
static cv::v_uint32x4 all(unsigned val) { return cv::v_setall_u32(val); }
};
template <> struct RegTrait<cv::v_int32x4> {
typedef cv::v_int64x2 w_reg;
typedef cv::v_uint32x4 u_reg;
static cv::v_int32x4 zero() { return cv::v_setzero_s32(); }
static cv::v_int32x4 all(int val) { return cv::v_setall_s32(val); }
};
template <> struct RegTrait<cv::v_uint64x2> {
static cv::v_uint64x2 zero() { return cv::v_setzero_u64(); }
static cv::v_uint64x2 all(uint64 val) { return cv::v_setall_u64(val); }
};
template <> struct RegTrait<cv::v_int64x2> {
static cv::v_int64x2 zero() { return cv::v_setzero_s64(); }
static cv::v_int64x2 all(int64 val) { return cv::v_setall_s64(val); }
};
template <> struct RegTrait<cv::v_float32x4> {
typedef cv::v_int32x4 int_reg;
typedef cv::v_float32x4 u_reg;
static cv::v_float32x4 zero() { return cv::v_setzero_f32(); }
static cv::v_float32x4 all(float val) { return cv::v_setall_f32(val); }
};
#if CV_SIMD128_64F
template <> struct RegTrait<cv::v_float64x2> {
typedef cv::v_int32x4 int_reg;
typedef cv::v_float64x2 u_reg;
static cv::v_float64x2 zero() { return cv::v_setzero_f64(); }
static cv::v_float64x2 all(double val) { return cv::v_setall_f64(val); }
};
#endif
#endif
#include "opencv2/ts.hpp"
CV_TEST_MAIN("cv")
#ifndef __OPENCV_HAL_TEST_PRECOMP_HPP__
#define __OPENCV_HAL_TEST_PRECOMP_HPP__
#include <iostream>
#include <limits>
#include "opencv2/ts.hpp"
#include "opencv2/hal.hpp"
#include "opencv2/hal/defs.h"
#include "opencv2/hal/intrin.hpp"
#endif