core:test Expand hal_intrin tests to support SIMD256

6499263b · Sayed Adel · 5336b9ad · 6499263b · 6499263b · 6499263b
Commit 6499263b authored Jul 24, 2018 by Sayed Adel
8 changed files
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -154,7 +154,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 // but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
 // Correspondingly, the wide intrinsics (which are mapped to the "widest"
 // available instruction set) will get vx_ prefix
-// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v245_load())
+// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
 #if CV_AVX2
 #include "opencv2/core/hal/intrin_avx.hpp"
@@ -214,14 +214,16 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
    inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \
    inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \
    inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \
+    inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \
+    inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \
    inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
    inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); }
 #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
-inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
+    inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } // emits vx_load_expand(const typ*) returning wtyp; it only forwards to <prefix>_load_expand (this hunk changes indentation only)
 #define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \
-inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); }
+    inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); } // emits vx_load_expand_q(const typ*) returning qtyp; it only forwards to <prefix>_load_expand_q (this hunk changes indentation only)
 #define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
@@ -316,7 +318,7 @@ template<typename _Tp> struct V_RegTraits
    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
    inline void vx_cleanup() { v256_cleanup(); }
-#elif CV_SIMD128
+#elif CV_SIMD128 || CV_SIMD128_CPP
    typedef v_uint8x16  v_uint8;
    typedef v_int8x16   v_int8;
    typedef v_uint16x8  v_uint16;

--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -407,6 +407,11 @@ inline v_float16x16 v256_load_f16(const short* ptr)
 inline v_float16x16 v256_load_f16_aligned(const short* ptr)
 { return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); }
+inline v_float16x16 v256_load_f16_low(const short* ptr) // FP16 flavour of v256_load_low: the halves are handled as raw 16-bit integers
+{ return v_float16x16(v256_load_low(ptr).val); } // delegate to the short overload, then rewrap its raw register (.val) as v_float16x16
+inline v_float16x16 v256_load_f16_halves(const short* ptr0, const short* ptr1) // FP16 flavour of v256_load_halves taking two source pointers
+{ return v_float16x16(v256_load_halves(ptr0, ptr1).val); } // delegate to the short overload, then rewrap its raw register (.val) as v_float16x16
 inline void v_store(short* ptr, const v_float16x16& a)
 { _mm256_storeu_si256((__m256i*)ptr, a.val); }
 inline void v_store_aligned(short* ptr, const v_float16x16& a)
@@ -819,94 +824,80 @@ OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
 template<int imm>
 inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
 {
-    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
+    enum {IMM_R = (16 - imm) & 0xFF}; // alignr byte offset used on the imm < 16 path; "& 0xFF" keeps the constant within 0..255 for every imm
+    enum {IMM_R2 = (32 - imm) & 0xFF}; // alignr byte offset used on the 16 < imm < 32 path
-    switch(imm)
-    {
-        case 0:  return a;
-        case 32: return b;
-        case 16: return v_uint8x32(swap);
-    }
-    if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm));
+    if (imm == 0)  return a; // no shift: result is the first operand unchanged
-    if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm));
+    if (imm == 32) return b; // shift by the full 32 bytes: result is the second operand
+    if (imm > 32)  return v_uint8x32(); // beyond the register width: default-constructed vector
-    return v_uint8x32();
+    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03); // now computed only after the trivial cases have returned
+    if (imm == 16) return v_uint8x32(swap); // exactly one 128-bit half: the permuted register is the answer
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
+    return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // imm < 32
 }
 template<int imm>
 inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
 {
-    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
+    enum {IMM_L = (imm - 16) & 0xFF}; // alignr byte offset used on the 16 < imm < 32 path; "& 0xFF" keeps the constant within 0..255 for every imm
-    switch(imm)
+    if (imm == 0)  return a; // no shift: result is the first operand unchanged
-    {
+    if (imm == 32) return b; // shift by the full 32 bytes: result is the second operand
-        case 0:  return a;
+    if (imm > 32)  return v_uint8x32(); // beyond the register width: default-constructed vector
-        case 32: return b;
-        case 16: return v_uint8x32(swap);
-    }
-    if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
+    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21); // now computed only after the trivial cases have returned
-    if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16));
+    if (imm == 16) return v_uint8x32(swap); // exactly one 128-bit half: the permuted register is the answer
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
-    return v_uint8x32();
+    return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L)); // remaining case: 16 < imm < 32
 }
 template<int imm>
 inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
 {
-    v_uint8x32 res;
+    enum {IMM_L = (imm - 16) & 0xFF}; // byte count for the slli path (imm > 16); masked into 0..255 for every imm
+    enum {IMM_R = (16 - imm) & 0xFF}; // byte offset for the alignr path (imm < 16); masked into 0..255 for every imm
+    if (imm == 0) return a; // no shift: input returned unchanged
+    if (imm > 32) return v_uint8x32(); // beyond the register width: default-constructed vector
    // ESAC control[3] ? [127:0] = 0
    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
+    if (imm == 16) return v_uint8x32(swapz); // exactly one 128-bit half: the permuted register is the answer
-    if (imm == 0)
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
-        return a;
+    return v_uint8x32(_mm256_slli_si256(swapz, IMM_L)); // 16 < imm <= 32; imm == 32 is not special-cased and arrives here with IMM_L == 16
-    if (imm == 16)
-        res.val = swapz;
-    else if (imm < 16)
-        res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm);
-    else if (imm < 32)
-        res.val = _mm256_slli_si256(swapz, imm - 16);
-    else
-        return v_uint8x32();
-    return res;
 }
 template<int imm>
 inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
 {
-    v_uint8x32 res;
+    enum {IMM_L = (imm - 16) & 0xFF}; // byte count for the srli path (imm > 16); masked into 0..255 for every imm
+    if (imm == 0) return a; // no shift: input returned unchanged
+    if (imm > 32) return v_uint8x32(); // beyond the register width: default-constructed vector
    // ESAC control[3] ? [127:0] = 0
    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
+    if (imm == 16) return v_uint8x32(swapz); // exactly one 128-bit half: the permuted register is the answer
-    if (imm == 0)
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
-        return a;
+    return v_uint8x32(_mm256_srli_si256(swapz, IMM_L)); // 16 < imm <= 32; imm == 32 is not special-cased and arrives here with IMM_L == 16
-    if (imm == 16)
+}
-        res.val = swapz;
-    else if (imm < 16)
+#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) /* generates the two-operand and one-operand `intrin` overloads for _Tpvec on top of the v_uint8x32 versions */ \
-        res.val = _mm256_alignr_epi8(swapz, a.val, imm);
+    template<int imm>                                             \
-    else if (imm < 32)
+    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)        \
-        res.val = _mm256_srli_si256(swapz, imm - 16);
+    {                                                             \
-    else
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; /* lane count scaled to a byte count; an enum replaces the former `const int w` local */ \
-        return v_uint8x32();
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),    \
-    return res;
+                                       v_reinterpret_as_u8(b));   \
-}
+        return _Tpvec(cast(ret.val));                             \
+    }                                                             \
-#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast)   \
+    template<int imm>                                             \
-    template<int imm>                                           \
+    inline _Tpvec intrin(const _Tpvec& a)                         \
-    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)      \
+    {                                                             \
-    {                                                           \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
-        const int w = sizeof(typename _Tpvec::lane_type);       \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));   \
-        v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a),  \
+        return _Tpvec(cast(ret.val));                             \
-                                       v_reinterpret_as_u8(b)); \
-        return _Tpvec(cast(ret.val));                           \
-    }                                                           \
-    template<int imm>                                           \
-    inline _Tpvec intrin(const _Tpvec& a)                       \
-    {                                                           \
-        const int w = sizeof(typename _Tpvec::lane_type);       \
-        v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a)); \
-        return _Tpvec(cast(ret.val));                           \
    }
 #define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec)                                  \

--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -319,6 +319,9 @@ static inline void cv_vst1_f16(void* ptr, float16x4_t a)
 #endif
 }
+#ifndef vdup_n_f16
+    #define vdup_n_f16(v) (float16x4_t){v, v, v, v}
+#endif
 struct v_float16x8
 {
@@ -889,6 +892,11 @@ inline v_float16x8 v_load_f16(const short* ptr)
 inline v_float16x8 v_load_f16_aligned(const short* ptr)
 { return v_float16x8(cv_vld1q_f16(ptr)); }
+inline v_float16x8 v_load_f16_low(const short* ptr) // NEON: fills only the low half of the register from memory
+{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); } // low half = cv_vld1_f16(ptr), high half = zeros from vdup_n_f16(0)
+inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1) // NEON: builds one register from two separate source pointers
+{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); } // low half from ptr0, high half from ptr1
 inline void v_store(short* ptr, const v_float16x8& a)
 { cv_vst1q_f16(ptr, a.val); }
 inline void v_store_aligned(short* ptr, const v_float16x8& a)

--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1308,6 +1308,11 @@ inline v_float16x8 v_load_f16(const short* ptr)
 inline v_float16x8 v_load_f16_aligned(const short* ptr)
 { return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }
+inline v_float16x8 v_load_f16_low(const short* ptr) // SSE: FP16 flavour of v_load_low, halves handled as raw 16-bit integers
+{ return v_float16x8(v_load_low(ptr).val); } // delegate to the short overload, then rewrap its raw register (.val) as v_float16x8
+inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1) // SSE: FP16 flavour of v_load_halves taking two source pointers
+{ return v_float16x8(v_load_halves(ptr0, ptr1).val); } // delegate to the short overload, then rewrap its raw register (.val) as v_float16x8
 inline void v_store(short* ptr, const v_float16x8& a)
 { _mm_storeu_si128((__m128i*)ptr, a.val); }
 inline void v_store_aligned(short* ptr, const v_float16x8& a)

--- a/modules/core/test/test_intrin.avx2.cpp
+++ b/modules/core/test/test_intrin.avx2.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "test_precomp.hpp"
+#include "test_intrin.simd.hpp"
\ No newline at end of file
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -2,249 +2,101 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "test_precomp.hpp"
+#include "test_intrin.simd.hpp"
-#include "test_intrin_utils.hpp"
+#define CV_CPU_SIMD_FILENAME "test_intrin.simd.hpp"
-#define CV_CPU_SIMD_FILENAME "test_intrin_utils.hpp"
 #define CV_CPU_DISPATCH_MODE FP16
 #include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
+#define CV_CPU_DISPATCH_MODE AVX2
-using namespace cv;
+#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
 namespace opencv_test { namespace hal {
 using namespace CV_CPU_OPTIMIZATION_NAMESPACE;
-//=============  8-bit integer =====================================================================
+TEST(hal_intrin, uint8x16) // the inline TheTest<v_uint8x16> chain moved into test_hal_intrin_uint8() (test_intrin.simd.hpp)
+{ test_hal_intrin_uint8(); } // direct call, not routed through CV_CPU_DISPATCH
-TEST(hal_intrin, uint8x16) {
-    TheTest<v_uint8x16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
-TEST(hal_intrin, int8x16) {
+TEST(hal_intrin, int8x16) // the inline TheTest<v_int8x16> chain moved into test_hal_intrin_int8() (test_intrin.simd.hpp)
-    TheTest<v_int8x16>()
+{ test_hal_intrin_int8(); } // direct call, not routed through CV_CPU_DISPATCH
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
-//============= 16-bit integer =====================================================================
+TEST(hal_intrin, uint16x8) // the inline TheTest<v_uint16x8> chain moved into test_hal_intrin_uint16() (test_intrin.simd.hpp)
+{ test_hal_intrin_uint16(); } // direct call, not routed through CV_CPU_DISPATCH
-TEST(hal_intrin, uint16x8) {
-    TheTest<v_uint16x8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-TEST(hal_intrin, int16x8) {
+TEST(hal_intrin, int16x8) // the inline TheTest<v_int16x8> chain moved into test_hal_intrin_int16() (test_intrin.simd.hpp)
-    TheTest<v_int16x8>()
+{ test_hal_intrin_int16(); } // direct call, not routed through CV_CPU_DISPATCH
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_dot_prod()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-//============= 32-bit integer =====================================================================
+TEST(hal_intrin, int32x4) // the inline TheTest<v_int32x4> chain moved into test_hal_intrin_int32() (test_intrin.simd.hpp)
+{ test_hal_intrin_int32(); } // direct call, not routed through CV_CPU_DISPATCH
-TEST(hal_intrin, uint32x4) {
-    TheTest<v_uint32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_transpose()
-        ;
-}
-TEST(hal_intrin, int32x4) {
+TEST(hal_intrin, uint32x4) // the inline TheTest<v_uint32x4> chain moved into test_hal_intrin_uint32() (test_intrin.simd.hpp)
-    TheTest<v_int32x4>()
+{ test_hal_intrin_uint32(); } // direct call, not routed through CV_CPU_DISPATCH
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_abs()
-        .test_cmp()
-        .test_popcount()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_float_cvt32()
-        .test_float_cvt64()
-        .test_transpose()
-        ;
-}
-//============= 64-bit integer =====================================================================
+TEST(hal_intrin, uint64x2) // the inline TheTest<v_uint64x2> chain moved into test_hal_intrin_uint64() (test_intrin.simd.hpp)
+{ test_hal_intrin_uint64(); } // direct call, not routed through CV_CPU_DISPATCH
-TEST(hal_intrin, uint64x2) {
-    TheTest<v_uint64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-TEST(hal_intrin, int64x2) {
+TEST(hal_intrin, int64x2) // the inline TheTest<v_int64x2> chain moved into test_hal_intrin_int64() (test_intrin.simd.hpp)
-    TheTest<v_int64x2>()
+{ test_hal_intrin_int64(); } // direct call, not routed through CV_CPU_DISPATCH
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-//============= Floating point =====================================================================
+TEST(hal_intrin, float32x4) // the inline TheTest<v_float32x4> chain moved into test_hal_intrin_float32() (test_intrin.simd.hpp)
+{ test_hal_intrin_float32(); } // direct call, not routed through CV_CPU_DISPATCH
-TEST(hal_intrin, float32x4) {
-    TheTest<v_float32x4>()
-        .test_loadstore()
-        .test_interleave()
-        .test_interleave_2channel()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt64()
-        .test_matmul()
-        .test_transpose()
-        .test_reduce_sum4()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        ;
-}
-#if CV_SIMD128_64F
+TEST(hal_intrin, float64x2) // no longer wrapped in #if CV_SIMD128_64F: the test is always registered
-TEST(hal_intrin, float64x2) {
+{ test_hal_intrin_float64(); } // the callee's body is guarded by CV_SIMD_64F in test_intrin.simd.hpp, so this runs no checks when that macro is 0
-    TheTest<v_float64x2>()
-        .test_loadstore()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt32()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-#endif
-TEST(hal_intrin,float16)
+TEST(hal_intrin, float16x8) // renamed from float16; NOTE(review): presumably CV_CPU_CALL_FP16_ returns after running the FP16 build when it is available, so the throw below is reached only otherwise - confirm against the macro
 {
    CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
    throw SkipTestException("Unsupported hardware: FP16 is not available");
 }
-}}
+#define DISPATCH_SIMD_MODES AVX2 // mode list handed to CV_CPU_DISPATCH by DISPATCH_SIMD
+#define DISPATCH_SIMD_NAME "SIMD256" // label spliced into the skip message below
+#define DISPATCH_SIMD(fun) /* calls fun() through CV_CPU_DISPATCH, then throws SkipTestException; NOTE(review): presumably the dispatch returns on success so the throw is reached only when no listed mode is usable - confirm */ \
+    do {                                                \
+        CV_CPU_DISPATCH(fun, (), DISPATCH_SIMD_MODES);  \
+        throw SkipTestException(                        \
+            "Unsupported hardware: "                    \
+            DISPATCH_SIMD_NAME                          \
+            " is not available"                         \
+        );                                              \
+    } while(0)
+TEST(hal_intrin256, uint8x32) // each hal_intrin256 case hands the same suite function used by hal_intrin to DISPATCH_SIMD (mode list: AVX2)
+{ DISPATCH_SIMD(test_hal_intrin_uint8); }
+TEST(hal_intrin256, int8x32)
+{ DISPATCH_SIMD(test_hal_intrin_int8); }
+TEST(hal_intrin256, uint16x16)
+{ DISPATCH_SIMD(test_hal_intrin_uint16); }
+TEST(hal_intrin256, int16x16)
+{ DISPATCH_SIMD(test_hal_intrin_int16); }
+TEST(hal_intrin256, uint32x8)
+{ DISPATCH_SIMD(test_hal_intrin_uint32); }
+TEST(hal_intrin256, int32x8)
+{ DISPATCH_SIMD(test_hal_intrin_int32); }
+TEST(hal_intrin256, uint64x4)
+{ DISPATCH_SIMD(test_hal_intrin_uint64); }
+TEST(hal_intrin256, int64x4)
+{ DISPATCH_SIMD(test_hal_intrin_int64); }
+TEST(hal_intrin256, float32x8)
+{ DISPATCH_SIMD(test_hal_intrin_float32); }
+TEST(hal_intrin256, float64x4) // the suite body is guarded by CV_SIMD_64F in test_intrin.simd.hpp
+{ DISPATCH_SIMD(test_hal_intrin_float64); }
+TEST(hal_intrin256, float16x16) // needs both FP16 support and the AVX2 dispatch
+{
+    if (!CV_CPU_HAS_SUPPORT_FP16) // check FP16 first so the skip message names the missing feature
+        throw SkipTestException("Unsupported hardware: FP16 is not available");
+    DISPATCH_SIMD(test_hal_intrin_float16); // on this path the macro's own skip message reports SIMD256 as unavailable
+}
+}} // namespace
\ No newline at end of file
--- a/modules/core/test/test_intrin.simd.hpp
+++ b/modules/core/test/test_intrin.simd.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "test_precomp.hpp"
+#include "test_intrin_utils.hpp"
+namespace opencv_test { namespace hal {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+void test_hal_intrin_uint8(); // prototypes sit above the CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY guard, so they stay visible when only declarations are wanted
+void test_hal_intrin_int8();
+void test_hal_intrin_uint16();
+void test_hal_intrin_int16();
+void test_hal_intrin_uint32();
+void test_hal_intrin_int32();
+void test_hal_intrin_uint64();
+void test_hal_intrin_int64();
+void test_hal_intrin_float32();
+void test_hal_intrin_float64(); // NOTE(review): test_hal_intrin_float16 has no prototype in this list - confirm it is declared elsewhere
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+//=============  8-bit integer =====================================================================
+void test_hal_intrin_uint8() // runs the TheTest checks on the wide alias v_uint8 (v_uint8x16 in the CV_SIMD128 branch of intrin.hpp)
+{
+    TheTest<v_uint8>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_cmp()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        ;
+#if CV_SIMD256 // extra template arguments exercised only in the 256-bit build
+    TheTest<v_uint8>()
+        .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
+        .test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
+        .test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
+        .test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
+        ;
+#endif // CV_SIMD256
+}
+void test_hal_intrin_int8() // runs the TheTest checks on the wide alias v_int8; unlike the uint8 suite it adds test_abs, omits test_pack_u and has no CV_SIMD256-only section
+{
+    TheTest<v_int8>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_cmp()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_abs()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        ;
+}
+//============= 16-bit integer =====================================================================
+void test_hal_intrin_uint16() // runs the TheTest checks on the wide alias v_uint16; the template arguments are the ones the former uint16x8 test used
+{
+    TheTest<v_uint16>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        ;
+}
+void test_hal_intrin_int16() // runs the TheTest checks on the wide alias v_int16; compared with the uint16 suite it adds test_dot_prod and test_abs and omits test_pack_u
+{
+    TheTest<v_int16>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_dot_prod()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_abs()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        ;
+}
+//============= 32-bit integer =====================================================================
+void test_hal_intrin_uint32() // runs the TheTest checks on the wide alias v_uint32; the template arguments are the ones the former uint32x4 test used
+{
+    TheTest<v_uint32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_transpose()
+        ;
+}
+void test_hal_intrin_int32() // runs the TheTest checks on the wide alias v_int32, including the float conversion checks (test_float_cvt32 / test_float_cvt64)
+{
+    TheTest<v_int32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_abs()
+        .test_cmp()
+        .test_popcount()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_float_cvt32()
+        .test_float_cvt64()
+        .test_transpose()
+        ;
+}
+//============= 64-bit integer =====================================================================
+void test_hal_intrin_uint64() // runs the reduced 64-bit set of TheTest checks on the wide alias v_uint64
+{
+    TheTest<v_uint64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+}
+void test_hal_intrin_int64() // runs the same reduced set of TheTest checks as the uint64 suite, on the wide alias v_int64
+{
+    TheTest<v_int64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+}
+//============= Floating point =====================================================================
+void test_hal_intrin_float32() // runs the TheTest checks on the wide alias v_float32
+{
+    TheTest<v_float32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_interleave_2channel()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_cmp()
+        .test_sqrt_abs()
+        .test_min_max()
+        .test_float_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_unpack()
+        .test_float_math()
+        .test_float_cvt64()
+        .test_matmul()
+        .test_transpose()
+        .test_reduce_sum4()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        ;
+#if CV_SIMD256 // extra template arguments 4..7, exercised only in the 256-bit build
+    TheTest<v_float32>()
+        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
+        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
+        ;
+#endif // CV_SIMD256
+}
+void test_hal_intrin_float64() // always defined; the body is empty when CV_SIMD_64F is 0, so callers then run no checks
+{
+#if CV_SIMD_64F
+    TheTest<v_float64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_cmp()
+        .test_sqrt_abs()
+        .test_min_max()
+        .test_float_absdiff()
+        .test_mask()
+        .test_unpack()
+        .test_float_math()
+        .test_float_cvt32()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+#if CV_SIMD256 // extra template arguments 2..3, exercised only in the 256-bit build
+    TheTest<v_float64>()
+        .test_extract<2>().test_extract<3>()
+        .test_rotate<2>().test_rotate<3>()
+        ;
+#endif //CV_SIMD256
+#endif // CV_SIMD_64F
+}
+#if CV_FP16 && CV_SIMD_WIDTH > 16 // defined here only for FP16 builds with CV_SIMD_WIDTH above 16 (presumably bytes, i.e. wider than 128-bit - confirm)
+void test_hal_intrin_float16() // runs the two FP16 checks on the wide alias v_float16
+{
+    TheTest<v_float16>()
+        .test_loadstore_fp16()
+        .test_float_cvt_fp16()
+        ;
+}
+#endif // CV_FP16 && CV_SIMD_WIDTH > 16
+#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}} //namespace
\ No newline at end of file
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp