Unverified commit 6d7f5871, authored by Vadim Pisarevsky, committed by GitHub

added basic support for CV_16F (the new datatype etc.) (#12463)

* added basic support for CV_16F (the new datatype etc.). CV_USRTYPE1 is now equal to CV_16F, which may break some [rarely used] functionality. We'll see

* fixed a just-introduced bug in norm; reverted erroneous changes in the Torch importer (need to find a better solution)

* addressed some issues found during the PR review

* restored the patch to fix some perf test failures
parent dca657a2
@@ -3009,6 +3009,7 @@ public:
    virtual Ptr<Formatted> format(const Mat& mtx) const = 0;
+     virtual void set16fPrecision(int p = 4) = 0;
virtual void set32fPrecision(int p = 8) = 0;
virtual void set64fPrecision(int p = 16) = 0;
virtual void setMultiline(bool ml = true) = 0;
@@ -317,13 +317,10 @@ Cv64suf;
#define CV_IS_SUBMAT(flags) ((flags) & CV_MAT_SUBMAT_FLAG)
/** Size of each channel item,
-   0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
- #define CV_ELEM_SIZE1(type) \
-     ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)
+   0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
+ #define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)

- /** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
- #define CV_ELEM_SIZE(type) \
-     (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
+ #define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
#ifndef MIN
# define MIN(a,b) ((a) > (b) ? (b) : (a))
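The new LUT drops the `sizeof(size_t)` trick: every depth, including the new CV_16F, now fits in one 32-bit constant, with one nibble per depth code. A minimal sketch of how the nibble lookup resolves; the `MY_*` macros below are hypothetical stand-ins for the real headers:

```cpp
#include <cassert>

// hypothetical simplified stand-ins for CV_MAT_DEPTH / CV_ELEM_SIZE1
#define MY_MAT_DEPTH(flags)  ((flags) & 7)
#define MY_ELEM_SIZE1(type)  ((0x28442211 >> MY_MAT_DEPTH(type)*4) & 15)

int main()
{
    assert(MY_ELEM_SIZE1(0 /*CV_8U*/)  == 1);  // lowest nibble: 1 byte
    assert(MY_ELEM_SIZE1(5 /*CV_32F*/) == 4);
    assert(MY_ELEM_SIZE1(6 /*CV_64F*/) == 8);
    assert(MY_ELEM_SIZE1(7 /*CV_16F*/) == 2);  // new top nibble; depth 7 used to report sizeof(size_t)
    return 0;
}
```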
@@ -195,6 +195,12 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
+ CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
+ CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
+ CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
+ CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
struct CV_EXPORTS DFT1D
{
static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0);
@@ -76,6 +76,7 @@ typedef signed char schar;
#define CV_32F 5
#define CV_64F 6
#define CV_USRTYPE1 7
+ #define CV_16F 7
#define CV_MAT_DEPTH_MASK (CV_DEPTH_MAX - 1)
#define CV_MAT_DEPTH(flags) ((flags) & CV_MAT_DEPTH_MASK)
@@ -124,6 +125,12 @@ typedef signed char schar;
#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))

+ #define CV_16FC1 CV_MAKETYPE(CV_16F,1)
+ #define CV_16FC2 CV_MAKETYPE(CV_16F,2)
+ #define CV_16FC3 CV_MAKETYPE(CV_16F,3)
+ #define CV_16FC4 CV_MAKETYPE(CV_16F,4)
+ #define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
//! @}
//! @name Comparison operation
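A quick sketch of what the new aliases unlock, assuming an OpenCV build that already contains this patch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat m(4, 4, CV_16FC3);   // 3-channel half-float matrix
    CV_Assert(m.depth() == CV_16F && m.channels() == 3);
    CV_Assert(CV_16FC(3) == CV_MAKETYPE(CV_16F, 3));
    CV_Assert(m.elemSize() == 6 && m.elemSize1() == 2);  // 3 channels x 2 bytes
    return 0;
}
```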
@@ -296,8 +296,10 @@ public:
    DEPTH_MASK_32S = 1 << CV_32S,
    DEPTH_MASK_32F = 1 << CV_32F,
    DEPTH_MASK_64F = 1 << CV_64F,
+     DEPTH_MASK_16F = 1 << CV_16F,
    DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1,
    DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S,
+     DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1,
    DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F
};
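Since CV_16F is depth 7, the mask arithmetic works out as below (a sketch, not part of the patch):

```cpp
// DEPTH_MASK_16F     = 1 << 7  = 0x80
// DEPTH_MASK_ALL     = 0x7F    (unchanged: every depth except 16F)
// DEPTH_MASK_ALL_16F = 0xFF    (every depth including 16F)
static_assert((1 << 7) == 0x80 && ((0x80 << 1) - 1) == 0xFF, "mask arithmetic");
```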
@@ -158,6 +158,22 @@ template<> inline uint64 saturate_cast<uint64>(int64 v) { return (uint64)st
template<> inline int64 saturate_cast<int64>(uint64 v) { return (int64)std::min(v, (uint64)LLONG_MAX); }

/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); }

+ // in theory, we could use a LUT for 8u/8s->16f conversion,
+ // but with hardware support for FP32->FP16 conversion the current approach is preferable
+ template<> inline float16_t saturate_cast<float16_t>(uchar v)   { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(schar v)   { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(ushort v)  { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(short v)   { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(unsigned v){ return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(int v)     { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(uint64 v)  { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(int64 v)   { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(float v)   { return float16_t(v); }
+ template<> inline float16_t saturate_cast<float16_t>(double v)  { return float16_t((float)v); }
//! @}
} // cv
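All integer sources are routed through float, so each cast costs one int-to-float conversion plus one FP32-to-FP16 rounding. Note that this follows IEEE conversion semantics rather than clamping to the FP16 finite range; a hedged sketch, assuming a build with this patch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::float16_t h = cv::saturate_cast<cv::float16_t>(1000);  // exactly representable in FP16
    CV_Assert((float)h == 1000.f);

    h = cv::saturate_cast<cv::float16_t>(100000);              // beyond FP16 max (65504)
    CV_Assert(cvIsInf((float)h));                              // rounds to +inf, not to 65504
    return 0;
}
```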
@@ -261,6 +261,20 @@ public:
    };
};

+ template<> class DataType<float16_t>
+ {
+ public:
+     typedef float16_t value_type;
+     typedef float     work_type;
+     typedef value_type channel_type;
+     typedef value_type vec_type;
+     enum { generic_type = 0,
+            depth        = CV_16F,
+            channels     = 1,
+            fmt          = (int)'h',
+            type         = CV_MAKETYPE(depth, channels)
+          };
+ };
/** @brief A helper class for cv::DataType
@@ -330,6 +344,12 @@ template<> class TypeDepth<CV_64F>
    typedef double value_type;
};

+ template<> class TypeDepth<CV_16F>
+ {
+     enum { depth = CV_16F };
+     typedef float16_t value_type;
+ };
#endif
//! @}
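What the new traits report, sketched below; `work_type` being `float` signals that arithmetic on 16F data is meant to run in FP32:

```cpp
#include <opencv2/core.hpp>

static_assert(cv::DataType<cv::float16_t>::depth == CV_16F, "depth");
static_assert(cv::DataType<cv::float16_t>::type == CV_16FC1, "type");
// accumulate/compute in FP32, store in FP16
static_assert(sizeof(cv::DataType<cv::float16_t>::work_type) == sizeof(float), "work type");
```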
@@ -3262,6 +3262,9 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
    case CV_64F:
        scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
        break;
+     case CV_16F:
+         scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
+         break;
default:
CV_Error(CV_StsUnsupportedFormat,"");
}
@@ -43,15 +43,15 @@ static const char* getTestOpMath(unsigned testOp)
const char* depthToString_(int depth)
{
-     static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" };
-     return (depth <= CV_USRTYPE1 && depth >= 0) ? depthNames[depth] : NULL;
+     static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_16F" };
+     return (depth <= CV_16F && depth >= 0) ? depthNames[depth] : NULL;
}
const cv::String typeToString_(int type)
{
int depth = CV_MAT_DEPTH(type);
int cn = CV_MAT_CN(type);
-     if (depth >= 0 && depth <= CV_USRTYPE1)
+     if (depth >= 0 && depth <= CV_16F)
return cv::format("%sC%d", depthToString_(depth), cn);
return cv::String();
}
@@ -8,7 +8,7 @@
namespace cv {

- /*namespace hal {
+ namespace hal {
void cvt16f32f( const float16_t* src, float* dst, int len )
{
@@ -50,21 +50,21 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
        dst[j] = float16_t(src[j]);
}

- /*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
+ void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
{
    // the loop is simple enough, so we let the compiler vectorize it
    for( int i = 0; i < len; i++ )
-         arr[i] = scaleBiasPairs[i*2 + 1];
+         arr[i] += scaleBiasPairs[i*2 + 1];
}

void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len )
{
    // the loop is simple enough, so we let the compiler vectorize it
    for( int i = 0; i < len; i++ )
-         arr[i] = scaleBiasPairs[i*2 + 1];
+         arr[i] += scaleBiasPairs[i*2 + 1];
}
- }*/
+ }
template<typename _Ts, typename _Td, typename _Twvec> inline void
cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
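The hunk above both activates the `hal::addRNGBias*` helpers and fixes their body: `=` became `+=`, so the bias is added to the already-scaled value instead of overwriting it. A standalone sketch of the intended contract (the function name here is hypothetical):

```cpp
// arr[i] was already set to scale*rng_output by the caller;
// the bias half of each (scale, bias) pair then shifts it into range.
static void add_rng_bias_ref(float* arr, const float* scaleBiasPairs, int len)
{
    for (int i = 0; i < len; i++)
        arr[i] += scaleBiasPairs[i*2 + 1];  // '+=': the buggy version used '=' and dropped arr[i]
}
```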
@@ -150,7 +150,7 @@ DEF_CVT_FUNC(8u16s, cvt_, uchar, short, v_int16)
DEF_CVT_FUNC(8u32s, cvt_, uchar, int, v_int32)
DEF_CVT_FUNC(8u32f, cvt_, uchar, float, v_float32)
DEF_CVT_FUNC(8u64f, cvt_, uchar, double, v_int32)
- //DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
+ DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
////////////////////// 8s -> ... ////////////////////////
@@ -160,7 +160,7 @@ DEF_CVT_FUNC(8s16s, cvt_, schar, short, v_int16)
DEF_CVT_FUNC(8s32s, cvt_, schar, int, v_int32)
DEF_CVT_FUNC(8s32f, cvt_, schar, float, v_float32)
DEF_CVT_FUNC(8s64f, cvt_, schar, double, v_int32)
- //DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
+ DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
////////////////////// 16u -> ... ////////////////////////
@@ -170,7 +170,7 @@ DEF_CVT_FUNC(16u16s, cvt_, ushort, short, v_int32)
DEF_CVT_FUNC(16u32s, cvt_, ushort, int, v_int32)
DEF_CVT_FUNC(16u32f, cvt_, ushort, float, v_float32)
DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32)
- //DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
+ DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
////////////////////// 16s -> ... ////////////////////////
@@ -180,7 +180,7 @@ DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32)
DEF_CVT_FUNC(16s32s, cvt_, short, int, v_int32)
DEF_CVT_FUNC(16s32f, cvt_, short, float, v_float32)
DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32)
- //DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
+ DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
////////////////////// 32s -> ... ////////////////////////
@@ -190,7 +190,7 @@ DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32)
DEF_CVT_FUNC(32s16s, cvt_, int, short, v_int32)
DEF_CVT_FUNC(32s32f, cvt_, int, float, v_float32)
DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32)
- //DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
+ DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
////////////////////// 32f -> ... ////////////////////////
@@ -210,17 +210,17 @@ DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32)
DEF_CVT_FUNC(64f16s, cvt_, double, short, v_int32)
DEF_CVT_FUNC(64f32s, cvt_, double, int, v_int32)
DEF_CVT_FUNC(64f32f, cvt_, double, float, v_float32)
- //DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
+ DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)

////////////////////// 16f -> ... ////////////////////////

- //DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32)
- //DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32)
- //DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
- //DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32)
- //DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32)
+ DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32)
+ DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32)
+ DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
+ DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32)
+ DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32)
DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float, v_float32)
- //DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
+ DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
///////////// "conversion" w/o conversion ///////////////
@@ -339,42 +339,41 @@ BinaryFunc getConvertFunc(int sdepth, int ddepth)
{
(BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
(BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 //(BinaryFunc)(cvt16f8u)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f8u), (BinaryFunc)(cvt16f8u)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
(BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 //(BinaryFunc)(cvt16f8s)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f8s), (BinaryFunc)(cvt16f8s)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
(BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 //(BinaryFunc)(cvt16f16u)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f16u), (BinaryFunc)(cvt16f16u)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
(BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 //(BinaryFunc)(cvt16f16s)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f16s), (BinaryFunc)(cvt16f16s)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
(BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 //(BinaryFunc)(cvt16f32s)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f32s), (BinaryFunc)(cvt16f32s)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
(BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
-         (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 //(BinaryFunc)(cvt16f32f)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f32f), (BinaryFunc)(cvt16f32f)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
(BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
-         (BinaryFunc)(cvt64s), 0 //(BinaryFunc)(cvt16f64f)
+         (BinaryFunc)(cvt64s), (BinaryFunc)(cvt16f64f)
},
{
-         0, 0, 0, 0, 0, 0, 0, 0
-         //(BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f),
-         //(BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u)
+         (BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f),
+         (BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u)
}
};
return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
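With the 16f row and column of `cvtTab` populated, `Mat::convertTo` can reach CV_16F from any depth and back. A quick round-trip sketch, assuming a build with this patch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat u8(2, 2, CV_8U, cv::Scalar(200)), h, back;
    u8.convertTo(h, CV_16F);    // dispatches to cvt8u16f
    h.convertTo(back, CV_8U);   // dispatches to cvt16f8u
    CV_Assert(cv::countNonZero(back != u8) == 0);  // 200 is exactly representable in FP16
    return 0;
}
```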
@@ -481,7 +480,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst )
    if(_dst.fixedType())
    {
        ddepth = _dst.depth();
-         CV_Assert(ddepth == CV_16S /*|| ddepth == CV_16F*/);
+         CV_Assert(ddepth == CV_16S || ddepth == CV_16F);
CV_Assert(_dst.channels() == _src.channels());
}
else
@@ -489,7 +488,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst )
        func = (BinaryFunc)cvt32f16f;
        break;
    case CV_16S:
-     //case CV_16F:
+     case CV_16F:
ddepth = CV_32F;
func = (BinaryFunc)cvt16f32f;
break;
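`convertFp16` now also accepts a true CV_16F source or destination instead of only the legacy CV_16S storage. A round-trip sketch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat src(8, 8, CV_32F), half, back;
    cv::randu(src, 0, 1);
    cv::convertFp16(src, half);   // FP32 -> FP16
    cv::convertFp16(half, back);  // FP16 -> FP32
    CV_Assert(back.type() == CV_32F && back.size() == src.size());
    return 0;
}
```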
@@ -150,12 +150,11 @@ static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }

- //static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
- //{
- //    a = vx_load_expand(ptr);
- //    b = vx_load_expand(ptr + v_float32::nlanes);
- //}
+ static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
+ {
+     a = vx_load_expand(ptr);
+     b = vx_load_expand(ptr + v_float32::nlanes);
+ }
static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b)
{
@@ -295,12 +294,12 @@ static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b
    b = vx_load(ptr + v_float64::nlanes);
}

- //static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
- //{
- //    v_float32 v0 = vx_load_expand(ptr);
- //    a = v_cvt_f64(v0);
- //    b = v_cvt_f64_high(v0);
- //}
+ static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
+ {
+     v_float32 v0 = vx_load_expand(ptr);
+     a = v_cvt_f64(v0);
+     b = v_cvt_f64_high(v0);
+ }
static inline void v_store_as(double* ptr, const v_float32& a)
{
@@ -349,11 +348,11 @@ static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float
    v_store(ptr, v);
}

- //static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
- //{
- //    v_float32 v = v_cvt_f32(a, b);
- //    v_pack_store(ptr, v);
- //}
+ static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
+ {
+     v_float32 v = v_cvt_f32(a, b);
+     v_pack_store(ptr, v);
+ }
#else
@@ -222,7 +222,7 @@ DEF_CVT_SCALE_FUNC(16s8u, cvt_32f, short, uchar, float)
DEF_CVT_SCALE_FUNC(32s8u, cvt_32f, int, uchar, float)
DEF_CVT_SCALE_FUNC(32f8u, cvt_32f, float, uchar, float)
DEF_CVT_SCALE_FUNC(64f8u, cvt_32f, double, uchar, float)
- //DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float)
+ DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float)
DEF_CVT_SCALE_FUNC(8u8s, cvt_32f, uchar, schar, float)
DEF_CVT_SCALE_FUNC(8s, cvt_32f, schar, schar, float)
@@ -231,7 +231,7 @@ DEF_CVT_SCALE_FUNC(16s8s, cvt_32f, short, schar, float)
DEF_CVT_SCALE_FUNC(32s8s, cvt_32f, int, schar, float)
DEF_CVT_SCALE_FUNC(32f8s, cvt_32f, float, schar, float)
DEF_CVT_SCALE_FUNC(64f8s, cvt_32f, double, schar, float)
- //DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float)
+ DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float)
DEF_CVT_SCALE_FUNC(8u16u, cvt_32f, uchar, ushort, float)
DEF_CVT_SCALE_FUNC(8s16u, cvt_32f, schar, ushort, float)
@@ -240,7 +240,7 @@ DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short, ushort, float)
DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int, ushort, float)
DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float, ushort, float)
DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float)
- //DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
+ DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
DEF_CVT_SCALE_FUNC(8u16s, cvt_32f, uchar, short, float)
DEF_CVT_SCALE_FUNC(8s16s, cvt_32f, schar, short, float)
@@ -249,7 +249,7 @@ DEF_CVT_SCALE_FUNC(16s, cvt_32f, short, short, float)
DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int, short, float)
DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float, short, float)
DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float)
- //DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
+ DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
DEF_CVT_SCALE_FUNC(8u32s, cvt_32f, uchar, int, float)
DEF_CVT_SCALE_FUNC(8s32s, cvt_32f, schar, int, float)
@@ -258,7 +258,7 @@ DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short, int, float)
DEF_CVT_SCALE_FUNC(32s, cvt_64f, int, int, double)
DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float, int, float)
DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, int, double)
- //DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
+ DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
DEF_CVT_SCALE_FUNC(8u32f, cvt_32f, uchar, float, float)
DEF_CVT_SCALE_FUNC(8s32f, cvt_32f, schar, float, float)
@@ -267,7 +267,7 @@ DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short, float, float)
DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int, float, float)
DEF_CVT_SCALE_FUNC(32f, cvt_32f, float, float, float)
DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double)
- //DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
+ DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
DEF_CVT_SCALE_FUNC(8u64f, cvt_64f, uchar, double, double)
DEF_CVT_SCALE_FUNC(8s64f, cvt_64f, schar, double, double)
@@ -276,16 +276,16 @@ DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short, double, double)
DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int, double, double)
DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float, double, double)
DEF_CVT_SCALE_FUNC(64f, cvt_64f, double, double, double)
- //DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)
+ DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)

- /*DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float)
+ DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float)
DEF_CVT_SCALE_FUNC(8s16f, cvt1_32f, schar, float16_t, float)
DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float)
DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short, float16_t, float)
DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int, float16_t, float)
DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float, float16_t, float)
DEF_CVT_SCALE_FUNC(64f16f, cvt_64f, double, float16_t, double)
- DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float)*/
+ DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float)
static BinaryFunc getCvtScaleAbsFunc(int depth)
{
@@ -306,43 +306,42 @@ BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
-         (BinaryFunc)cvtScale64f8u, 0 //(BinaryFunc)cvtScale16f8u
+         (BinaryFunc)cvtScale64f8u, (BinaryFunc)cvtScale16f8u
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
-         (BinaryFunc)cvtScale64f8s, 0 //(BinaryFunc)cvtScale16f8s
+         (BinaryFunc)cvtScale64f8s, (BinaryFunc)cvtScale16f8s
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
-         (BinaryFunc)cvtScale64f16u, 0 //(BinaryFunc)cvtScale16f16u
+         (BinaryFunc)cvtScale64f16u, (BinaryFunc)cvtScale16f16u
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
-         (BinaryFunc)cvtScale64f16s, 0 //(BinaryFunc)cvtScale16f16s
+         (BinaryFunc)cvtScale64f16s, (BinaryFunc)cvtScale16f16s
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
-         (BinaryFunc)cvtScale64f32s, 0 //(BinaryFunc)cvtScale16f32s
+         (BinaryFunc)cvtScale64f32s, (BinaryFunc)cvtScale16f32s
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
-         (BinaryFunc)cvtScale64f32f, 0 //(BinaryFunc)cvtScale16f32f
+         (BinaryFunc)cvtScale64f32f, (BinaryFunc)cvtScale16f32f
},
{
(BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
(BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
-         (BinaryFunc)cvtScale64f, 0 //(BinaryFunc)cvtScale16f64f
+         (BinaryFunc)cvtScale64f, (BinaryFunc)cvtScale16f64f
},
{
-         0, 0, 0, 0, 0, 0, 0, 0
-         /*(BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
+         (BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
        (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f,
-         (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f*/
+         (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f
},
};
@@ -216,8 +216,10 @@ static MergeFunc getMergeFunc(int depth)
{
    static MergeFunc mergeTab[] =
    {
-         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
-         (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
+         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u),
+         (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
+         (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s),
+         (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u)
};
return mergeTab[depth];
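16F planes reuse the 16-bit merge kernel (and, later in this commit, the 16-bit split kernel) because these routines only move 16-bit payloads without interpreting them. A small sketch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat planes[] = { cv::Mat(3, 3, CV_16F), cv::Mat(3, 3, CV_16F) }, merged;
    cv::merge(planes, 2, merged);      // runs the merge16u kernel under the hood
    CV_Assert(merged.type() == CV_16FC2);
    return 0;
}
```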
@@ -723,7 +723,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
        return result;
    }

-     NormFunc func = getNormFunc(normType >> 1, depth);
+     NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
CV_Assert( func != 0 );
const Mat* arrays[] = {&src, &mask, 0};
@@ -737,19 +737,31 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
-     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
-     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
+     int j, total = (int)it.size, blockSize = total;
+     bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    int isum = 0;
    int *ibuf = &result.i;
+     AutoBuffer<float> fltbuf_;
+     float* fltbuf = 0;
+     size_t esz = 0;

    if( blockSum )
    {
-         intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
-         blockSize = std::min(blockSize, intSumBlockSize);
-         ibuf = &isum;
+         esz = src.elemSize();
+         if( depth == CV_16F )
+         {
+             blockSize = std::min(blockSize, 1024);
+             fltbuf_.allocate(blockSize);
+             fltbuf = fltbuf_.data();
+         }
+         else
+         {
+             int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+             blockSize = std::min(blockSize, intSumBlockSize);
+             ibuf = &isum;
+         }
    }
for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -757,13 +769,17 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
-             func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
-             count += bsz;
-             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
+             const uchar* data = ptrs[0];
+             if( depth == CV_16F )
+             {
+                 hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
+                 data = (const uchar*)fltbuf;
+             }
+             func( data, ptrs[1], (uchar*)ibuf, bsz, cn );
+             if( blockSum && depth != CV_16F )
            {
                result.d += isum;
                isum = 0;
-                 count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
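The net effect of this hunk: a CV_16F norm matches the norm of the same data expanded to CV_32F, because 16F input is converted block by block (1024 elements at a time) and fed to the 32F kernel. A sketch, assuming a build with this patch:

```cpp
#include <opencv2/core.hpp>
#include <cmath>

int main()
{
    cv::Mat f32(256, 256, CV_32F), f16, f16as32;
    cv::randu(f32, -1, 1);
    f32.convertTo(f16, CV_16F);        // quantize to half precision
    f16.convertTo(f16as32, CV_32F);    // expand back to float
    double a = cv::norm(f16, cv::NORM_L2), b = cv::norm(f16as32, cv::NORM_L2);
    CV_Assert(std::abs(a - b) < 1e-6 * (1 + b));  // same values, same FP32 kernel
    return 0;
}
```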
@@ -1181,7 +1197,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
        return result;
    }

-     NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
+     NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
CV_Assert( func != 0 );
const Mat* arrays[] = {&src1, &src2, &mask, 0};
@@ -1196,19 +1212,31 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
-     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
-     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
+     int j, total = (int)it.size, blockSize = total;
+     bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    unsigned isum = 0;
    unsigned *ibuf = &result.u;
+     AutoBuffer<float> fltbuf_;
+     float* fltbuf = 0;
+     size_t esz = 0;

    if( blockSum )
    {
-         intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
-         blockSize = std::min(blockSize, intSumBlockSize);
-         ibuf = &isum;
+         esz = src1.elemSize();
+         if( depth == CV_16F )
+         {
+             blockSize = std::min(blockSize, 1024);
+             fltbuf_.allocate(blockSize*2);
+             fltbuf = fltbuf_.data();
+         }
+         else
+         {
+             int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+             blockSize = std::min(blockSize, intSumBlockSize);
+             ibuf = &isum;
+         }
    }
for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -1216,13 +1244,19 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
-             func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
-             count += bsz;
-             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
+             const uchar *data0 = ptrs[0], *data1 = ptrs[1];
+             if( depth == CV_16F )
+             {
+                 hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
+                 hal::cvt16f32f((const float16_t*)ptrs[1], fltbuf + bsz, bsz);
+                 data0 = (const uchar*)fltbuf;
+                 data1 = (const uchar*)(fltbuf + bsz);
+             }
+             func( data0, data1, ptrs[2], (uchar*)ibuf, bsz, cn );
+             if( blockSum && depth != CV_16F )
            {
                result.d += isum;
                isum = 0;
-                 count = 0;
            }
            ptrs[0] += bsz*esz;
            ptrs[1] += bsz*esz;
@@ -77,6 +77,7 @@ namespace cv
    void valueToStr32s() { sprintf(buf, "%d", mtx.ptr<int>(row, col)[cn]); }
    void valueToStr32f() { sprintf(buf, floatFormat, mtx.ptr<float>(row, col)[cn]); }
    void valueToStr64f() { sprintf(buf, floatFormat, mtx.ptr<double>(row, col)[cn]); }
+     void valueToStr16f() { sprintf(buf, floatFormat, (float)mtx.ptr<float16_t>(row, col)[cn]); }
void valueToStrOther() { buf[0] = 0; }
public:
@@ -115,7 +116,8 @@ namespace cv
        case CV_32S: valueToStr = &FormattedImpl::valueToStr32s; break;
        case CV_32F: valueToStr = &FormattedImpl::valueToStr32f; break;
        case CV_64F: valueToStr = &FormattedImpl::valueToStr64f; break;
-         default: valueToStr = &FormattedImpl::valueToStrOther; break;
+         default: CV_Assert(mtx.depth() == CV_16F);
+                  valueToStr = &FormattedImpl::valueToStr16f;
}
}
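Printing now works for 16F matrices: each value is widened to float and formatted with the 16f precision (default 4 digits, adjustable via the new `set16fPrecision` below). A sketch:

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Mat m = (cv::Mat_<float>(1, 3) << 0.1f, 1.5f, -2.25f), h;
    m.convertTo(h, CV_16F);
    std::cout << h << std::endl;   // goes through the new valueToStr16f path
    return 0;
}
```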
@@ -256,7 +258,12 @@ namespace cv
class FormatterBase : public Formatter
{
public:
-     FormatterBase() : prec32f(8), prec64f(16), multiline(true) {}
+     FormatterBase() : prec16f(4), prec32f(8), prec64f(16), multiline(true) {}

+     void set16fPrecision(int p) CV_OVERRIDE
+     {
+         prec16f = p;
+     }
void set32fPrecision(int p) CV_OVERRIDE
{
@@ -274,6 +281,7 @@
    }

protected:
+     int prec16f;
int prec32f;
int prec64f;
int multiline;
@@ -325,7 +333,7 @@ namespace cv
    {
        static const char* numpyTypes[] =
        {
-             "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "uint64"
+             "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "float16"
};
char braces[5] = {'[', ']', ',', '[', ']'};
if (mtx.cols == 1)
@@ -48,18 +48,6 @@
#include "precomp.hpp"

- #if defined _WIN32 || defined WINCE
-     #include <windows.h>
-     #undef small
-     #undef min
-     #undef max
-     #undef abs
- #endif

- #if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-     #include "emmintrin.h"
- #endif
namespace cv
{
@@ -74,12 +62,6 @@ namespace cv
#define RNG_NEXT(x) ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32))

- #ifdef __PPC64__
- #define PPC_MUL_ADD(ret, tmp, p0, p1) \
-     asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret) \
-                : "f" (tmp), "f" (p0), "f" (p1))
- #endif
/***************************************************************************************\
* Pseudo-Random Number Generators (PRNGs) *
\***************************************************************************************/
@@ -154,59 +136,26 @@ template<typename T> static void
randi_( T* arr, int len, uint64* state, const DivStruct* p )
{
    uint64 temp = *state;
-     int i = 0;
-     unsigned t0, t1, v0, v1;
-     for( i = 0; i <= len - 4; i += 4 )
+     for( int i = 0; i < len; i++ )
    {
        temp = RNG_NEXT(temp);
-         t0 = (unsigned)temp;
-         temp = RNG_NEXT(temp);
-         t1 = (unsigned)temp;
-         v0 = (unsigned)(((uint64)t0 * p[i].M) >> 32);
-         v1 = (unsigned)(((uint64)t1 * p[i+1].M) >> 32);
-         v0 = (v0 + ((t0 - v0) >> p[i].sh1)) >> p[i].sh2;
-         v1 = (v1 + ((t1 - v1) >> p[i+1].sh1)) >> p[i+1].sh2;
-         v0 = t0 - v0*p[i].d + p[i].delta;
-         v1 = t1 - v1*p[i+1].d + p[i+1].delta;
-         arr[i] = saturate_cast<T>((int)v0);
-         arr[i+1] = saturate_cast<T>((int)v1);
-         temp = RNG_NEXT(temp);
-         t0 = (unsigned)temp;
-         temp = RNG_NEXT(temp);
-         t1 = (unsigned)temp;
-         v0 = (unsigned)(((uint64)t0 * p[i+2].M) >> 32);
-         v1 = (unsigned)(((uint64)t1 * p[i+3].M) >> 32);
-         v0 = (v0 + ((t0 - v0) >> p[i+2].sh1)) >> p[i+2].sh2;
-         v1 = (v1 + ((t1 - v1) >> p[i+3].sh1)) >> p[i+3].sh2;
-         v0 = t0 - v0*p[i+2].d + p[i+2].delta;
-         v1 = t1 - v1*p[i+3].d + p[i+3].delta;
-         arr[i+2] = saturate_cast<T>((int)v0);
-         arr[i+3] = saturate_cast<T>((int)v1);
+         unsigned t = (unsigned)temp;
+         unsigned v = (unsigned)(((uint64)t * p[i].M) >> 32);
+         v = (v + ((t - v) >> p[i].sh1)) >> p[i].sh2;
+         v = t - v*p[i].d + p[i].delta;
+         arr[i] = saturate_cast<T>((int)v);
    }
-     for( ; i < len; i++ )
-     {
-         temp = RNG_NEXT(temp);
-         t0 = (unsigned)temp;
-         v0 = (unsigned)(((uint64)t0 * p[i].M) >> 32);
-         v0 = (v0 + ((t0 - v0) >> p[i].sh1)) >> p[i].sh2;
-         v0 = t0 - v0*p[i].d + p[i].delta;
-         arr[i] = saturate_cast<T>((int)v0);
-     }
*state = temp;
}
#define DEF_RANDI_FUNC(suffix, type) \
static void randBits_##suffix(type* arr, int len, uint64* state, \
-                                    const Vec2i* p, bool small_flag) \
+                                    const Vec2i* p, void*, bool small_flag) \
{ randBits_(arr, len, state, p, small_flag); } \
\
static void randi_##suffix(type* arr, int len, uint64* state, \
-                            const DivStruct* p, bool ) \
+                            const DivStruct* p, void*, bool ) \
{ randi_(arr, len, state, p); }
DEF_RANDI_FUNC(8u, uchar)
@@ -215,131 +164,62 @@ DEF_RANDI_FUNC(16u, ushort)
DEF_RANDI_FUNC(16s, short)
DEF_RANDI_FUNC(32s, int)

- static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool )
+ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, void*, bool )
{
    uint64 temp = *state;
-     int i = 0;
-     for( ; i <= len - 4; i += 4 )
+     for( int i = 0; i < len; i++ )
    {
-         float f[4];
-         f[0] = (float)(int)(temp = RNG_NEXT(temp));
-         f[1] = (float)(int)(temp = RNG_NEXT(temp));
-         f[2] = (float)(int)(temp = RNG_NEXT(temp));
-         f[3] = (float)(int)(temp = RNG_NEXT(temp));
-         // handwritten SSE is required not for performance but for numerical stability!
-         // both 32-bit gcc and MSVC compilers trend to generate double precision SSE
-         // while 64-bit compilers generate single precision SIMD instructions
-         // so manual vectorisation forces all compilers to the single precision
- #if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-         __m128 q0 = _mm_loadu_ps((const float*)(p + i));
-         __m128 q1 = _mm_loadu_ps((const float*)(p + i + 2));
-         __m128 q01l = _mm_unpacklo_ps(q0, q1);
-         __m128 q01h = _mm_unpackhi_ps(q0, q1);
-         __m128 p0 = _mm_unpacklo_ps(q01l, q01h);
-         __m128 p1 = _mm_unpackhi_ps(q01l, q01h);
-         _mm_storeu_ps(arr + i, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(f), p0), p1));
- #elif defined __ARM_NEON && defined __aarch64__
-         // handwritten NEON is required not for performance but for numerical stability!
-         // 64bit gcc tends to use fmadd instead of separate multiply and add
-         // use volatile to ensure to separate the multiply and add
-         float32x4x2_t q = vld2q_f32((const float*)(p + i));
-         float32x4_t p0 = q.val[0];
-         float32x4_t p1 = q.val[1];
-         volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0);
-         vst1q_f32(arr+i, vaddq_f32(v0, p1));
- #elif defined __PPC64__
-         // inline asm is required for numerical stability!
-         // compilers tends to use floating multiply-add single(fmadds)
-         // instead of separate multiply and add
-         PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]);
-         PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]);
-         PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]);
-         PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]);
- #else
-         arr[i+0] = f[0]*p[i+0][0] + p[i+0][1];
-         arr[i+1] = f[1]*p[i+1][0] + p[i+1][1];
-         arr[i+2] = f[2]*p[i+2][0] + p[i+2][1];
-         arr[i+3] = f[3]*p[i+3][0] + p[i+3][1];
- #endif
-     }
-     for( ; i < len; i++ )
-     {
-         temp = RNG_NEXT(temp);
- #if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-         _mm_store_ss(arr + i, _mm_add_ss(
-                 _mm_mul_ss(_mm_set_ss((float)(int)temp), _mm_set_ss(p[i][0])),
-                 _mm_set_ss(p[i][1]))
-                 );
- #elif defined __ARM_NEON && defined __aarch64__
-         float32x2_t t = vadd_f32(vmul_f32(
-                 vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])),
-                 vdup_n_f32(p[i][1]));
-         arr[i] = vget_lane_f32(t, 0);
- #elif defined __PPC64__
-         PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]);
- #else
-         arr[i] = (int)temp*p[i][0] + p[i][1];
- #endif
+         int t = (int)(temp = RNG_NEXT(temp));
+         arr[i] = (float)(t*p[i][0]);
    }
    *state = temp;
+     // add bias separately to make the generated random numbers
+     // more deterministic, independent of
+     // architecture details (FMA instruction use etc.)
+     hal::addRNGBias32f(arr, &p[0][0], len);
}
static void
- randf_64f( double* arr, int len, uint64* state, const Vec2d* p, bool )
+ randf_64f( double* arr, int len, uint64* state, const Vec2d* p, void*, bool )
{
    uint64 temp = *state;
-     int64 v = 0;
-     int i;
-     for( i = 0; i <= len - 4; i += 4 )
+     for( int i = 0; i < len; i++ )
    {
-         double f0, f1;
        temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         f0 = v*p[i][0] + p[i][1];
-         temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         f1 = v*p[i+1][0] + p[i+1][1];
-         arr[i] = f0; arr[i+1] = f1;
-         temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         f0 = v*p[i+2][0] + p[i+2][1];
-         temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         f1 = v*p[i+3][0] + p[i+3][1];
-         arr[i+2] = f0; arr[i+3] = f1;
+         int64 v = (temp >> 32)|(temp << 32);
+         arr[i] = v*p[i][0];
    }
-     for( ; i < len; i++ )
-     {
-         temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         arr[i] = v*p[i][0] + p[i][1];
-     }
    *state = temp;
+     hal::addRNGBias64f(arr, &p[0][0], len);
}

+ static void randf_16f( float16_t* arr, int len, uint64* state, const Vec2f* p, float* fbuf, bool )
+ {
+     uint64 temp = *state;
+     for( int i = 0; i < len; i++ )
+     {
+         float f = (float)(int)(temp = RNG_NEXT(temp));
+         fbuf[i] = f*p[i][0];
+     }
+     *state = temp;
+     // add bias separately to make the generated random numbers
+     // more deterministic, independent of
+     // architecture details (FMA instruction use etc.)
+     hal::addRNGBias32f(fbuf, &p[0][0], len);
+     hal::cvt32f16f(fbuf, arr, len);
+ }
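End to end, uniform 16F fills now work: values are produced into the float scratch buffer (`fbuf`, wired through as `tmpbuf` below) and packed to FP16 only at the end. A sketch, assuming a build with this patch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat m(4, 4, CV_16FC1), m32;
    cv::theRNG().fill(m, cv::RNG::UNIFORM, -1.0, 1.0);
    m.convertTo(m32, CV_32F);           // minMaxLoc does not take 16F directly
    double mn, mx;
    cv::minMaxLoc(m32, &mn, &mx);
    CV_Assert(-1.0 <= mn && mx <= 1.0); // FP16 rounding may touch the endpoints
    return 0;
}
```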
- typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, bool small_flag);
+ typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, void* tempbuf, bool small_flag);

static RandFunc randTab[][8] =
{
    {
        (RandFunc)randi_8u, (RandFunc)randi_8s, (RandFunc)randi_16u, (RandFunc)randi_16s,
-         (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, 0
+         (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, (RandFunc)randf_16f
    },
},
{
(RandFunc)randBits_8u, (RandFunc)randBits_8s, (RandFunc)randBits_16u, (RandFunc)randBits_16s,
@@ -350,7 +230,7 @@ static RandFunc randTab[][8] =
/*
The code below implements the algorithm described in
"The Ziggurat Method for Generating Random Variables"
- by Marsaglia and Tsang, Journal of Statistical Software.
+ by George Marsaglia and Wai Wan Tsang, Journal of Statistical Software, 2007.
*/
static void
randn_0_1_32f( float* arr, int len, uint64* state )
@@ -631,8 +511,8 @@ void RNG::fill( InputOutputArray _mat, int disttype,
    // for each channel i compute such dparam[0][i] & dparam[1][i],
    // so that a signed 32/64-bit integer X is transformed to
    // the range [param1.val[i], param2.val[i]) using
-     // dparam[1][i]*X + dparam[0][i]
+     // dparam[0][i]*X + dparam[1][i]
-     if( depth == CV_32F )
+     if( depth != CV_64F )
{
fp = (Vec2f*)(parambuf + cn*2);
for( j = 0; j < cn; j++ )
@@ -704,6 +584,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
    AutoBuffer<double> buf;
    uchar* param = 0;
    float* nbuf = 0;
+     float* tmpbuf = 0;
if( disttype == UNIFORM )
{
@@ -727,12 +608,14 @@ void RNG::fill( InputOutputArray _mat, int disttype,
                    p[j + k] = ip[k];
            }
        }
-         else if( depth == CV_32F )
+         else if( depth != CV_64F )
        {
            Vec2f* p = (Vec2f*)param;
            for( j = 0; j < blockSize*cn; j += cn )
                for( k = 0; k < cn; k++ )
                    p[j + k] = fp[k];
+             if( depth == CV_16F )
+                 tmpbuf = (float*)p + blockSize*cn*2;
}
else
{
@@ -755,7 +638,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
        int len = std::min(total - j, blockSize);
        if( disttype == CV_RAND_UNI )
-             func( ptr, len*cn, &state, param, smallFlag );
+             func( ptr, len*cn, &state, param, tmpbuf, smallFlag );
else
{
randn_0_1_32f(nbuf, len*cn, &state);
@@ -224,8 +224,10 @@ static SplitFunc getSplitFunc(int depth)
{
    static SplitFunc splitTab[] =
    {
-         (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
-         (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0
+         (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u),
+         (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
+         (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s),
+         (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u)
};
return splitTab[depth];
@@ -78,7 +78,7 @@ OCL_TEST_P(UMatExpr, Ones)
//////////////////////////////// Instantiation /////////////////////////////////////////////////

- OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS));
+ OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS_16F, OCL_ALL_CHANNELS));
} } // namespace opencv_test::ocl
@@ -476,7 +476,7 @@ struct CopyOp : public BaseElemWiseOp
    }
    int getRandomType(RNG& rng)
    {
-         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
+         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS);
}
double getMaxErr(int)
{
@@ -498,7 +498,7 @@ struct SetOp : public BaseElemWiseOp
    }
    int getRandomType(RNG& rng)
    {
-         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
+         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS);
}
double getMaxErr(int)
{
@@ -372,6 +372,7 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
#define OCL_ON(...) cv::ocl::setUseOpenCL(true); __VA_ARGS__ ;

#define OCL_ALL_DEPTHS Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F)
+ #define OCL_ALL_DEPTHS_16F Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
#define OCL_ALL_CHANNELS Values(1, 2, 3, 4)
CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA, INTER_LINEAR_EXACT)
@@ -160,7 +160,7 @@ private:
}; \
static inline void PrintTo(const class_name& t, std::ostream* os) { t.PrintTo(os); } }

- CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_USRTYPE1)
+ CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
/*****************************************************************************************\
* Regression control utility for performance testing *
@@ -72,10 +72,10 @@ int randomType(RNG& rng, int typeMask, int minChannels, int maxChannels)
{
    int channels = rng.uniform(minChannels, maxChannels+1);
    int depth = 0;
-     CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL) != 0);
+     CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL_16F) != 0);
    for(;;)
    {
-         depth = rng.uniform(CV_8U, CV_64F+1);
+         depth = rng.uniform(CV_8U, CV_16F+1);
if( ((1 << depth) & typeMask) != 0 )
break;
}
@@ -1260,6 +1260,13 @@ norm_(const _Tp* src1, const _Tp* src2, size_t total, int cn, int normType, doub
double norm(InputArray _src, int normType, InputArray _mask)
{
    Mat src = _src.getMat(), mask = _mask.getMat();
+     if( src.depth() == CV_16F )
+     {
+         Mat src32f;
+         src.convertTo(src32f, CV_32F);
+         return cvtest::norm(src32f, normType, _mask);
+     }
if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
{
if( !mask.empty() )
@@ -1340,6 +1347,14 @@ double norm(InputArray _src, int normType, InputArray _mask)
double norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask)
{
    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
+     if( src1.depth() == CV_16F )
+     {
+         Mat src1_32f, src2_32f;
+         src1.convertTo(src1_32f, CV_32F);
+         src2.convertTo(src2_32f, CV_32F);
+         return cvtest::norm(src1_32f, src2_32f, normType, _mask);
+     }
bool isRelative = (normType & NORM_RELATIVE) != 0;
normType &= ~NORM_RELATIVE;
@@ -1982,11 +1997,20 @@ int check( const Mat& a, double fmin, double fmax, vector<int>* _idx )
// success_err_level is maximum allowed difference, idx is the index of the first
// element for which difference is >success_err_level
// (or index of element with the maximum difference)
- int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
+ int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
            double success_err_level, vector<int>* _idx,
            bool element_wise_relative_error )
{
+     Mat arr = arr_, refarr = refarr_;
    CV_Assert( arr.type() == refarr.type() && arr.size == refarr.size );
+     if( arr.depth() == CV_16F )
+     {
+         Mat arr32f, refarr32f;
+         arr.convertTo(arr32f, CV_32F);
+         refarr.convertTo(refarr32f, CV_32F);
+         arr = arr32f;
+         refarr = refarr32f;
+     }
int ilevel = refarr.depth() <= CV_32S ? cvFloor(success_err_level) : 0;
int result = CMP_EPS_OK;
@@ -594,11 +594,11 @@ Regression& Regression::operator() (const std::string& name, cv::InputArray arra
    // exit if current test is already failed
    if(::testing::UnitTest::GetInstance()->current_test_info()->result()->Failed()) return *this;

-     if(!array.empty() && array.depth() == CV_USRTYPE1)
+     /*if(!array.empty() && array.depth() == CV_USRTYPE1)
    {
        ADD_FAILURE() << " Can not check regression for CV_USRTYPE1 data type for " << name;
        return *this;
-     }
+     }*/
std::string nodename = getCurrentTestNodeName();
@@ -2207,7 +2207,7 @@ void PrintTo(const MatType& t, ::std::ostream* os)
        case CV_32S: *os << "32S"; break;
        case CV_32F: *os << "32F"; break;
        case CV_64F: *os << "64F"; break;
-         case CV_USRTYPE1: *os << "USRTYPE1"; break;
+         case CV_USRTYPE1: *os << "16F"; break;
        default: *os << "INVALID_TYPE"; break;
default: *os << "INVALID_TYPE"; break;
}
*os << 'C' << CV_MAT_CN((int)t);