added basic support for CV_16F (the new datatype etc.) (#12463)

* added basic support for CV_16F (the new datatype etc.). CV_USRTYPE1 is now equal to CV_16F, which may break some [rarely used] functionality. We'll see
* fixed the just-introduced bug in norm; reverted erroneous changes in Torch importer (need to find a better solution)
* addressed some issues found during the PR review
* restored the patch to fix some perf test failures

added basic support for CV_16F (the new datatype etc.) (#12463)
* added basic support for CV_16F (the new datatype etc.). CV_USRTYPE1 is now equal to CV_16F, which may break some [rarely used] functionality. We'll see
* fixed the just-introduced bug in norm; reverted erroneous changes in Torch importer (need to find a better solution)
* addressed some issues found during the PR review
* restored the patch to fix some perf test failures
6d7f5871 · Vadim Pisarevsky · GitHub · dca657a2 · 6d7f5871 · 6d7f5871
Unverified Commit 6d7f5871 authored Sep 10, 2018 by Vadim Pisarevsky Committed by GitHub Sep 10, 2018
23 changed files
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -3009,6 +3009,7 @@ public:
    virtual Ptr<Formatted> format(const Mat& mtx) const = 0;
+    virtual void set16fPrecision(int p = 4) = 0;
    virtual void set32fPrecision(int p = 8) = 0;
    virtual void set64fPrecision(int p = 16) = 0;
    virtual void setMultiline(bool ml = true) = 0;

--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -317,13 +317,10 @@ Cv64suf;
 #define CV_IS_SUBMAT(flags)     ((flags) & CV_MAT_SUBMAT_FLAG)
 /** Size of each channel item,
-   0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
+   0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
-#define CV_ELEM_SIZE1(type) \
+#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
-    ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)
-/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
+#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
-#define CV_ELEM_SIZE(type) \
-    (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
 #ifndef MIN
 #  define MIN(a,b)  ((a) > (b) ? (b) : (a))

--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -195,6 +195,12 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
 CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
 CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
+CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
+CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
+CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
 struct CV_EXPORTS DFT1D
 {
    static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0);

--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -76,6 +76,7 @@ typedef signed char schar;
 #define CV_32F  5
 #define CV_64F  6
 #define CV_USRTYPE1 7
+#define CV_16F  7
 #define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
 #define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
@@ -124,6 +125,12 @@ typedef signed char schar;
 #define CV_64FC3 CV_MAKETYPE(CV_64F,3)
 #define CV_64FC4 CV_MAKETYPE(CV_64F,4)
 #define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
+#define CV_16FC1 CV_MAKETYPE(CV_16F,1)
+#define CV_16FC2 CV_MAKETYPE(CV_16F,2)
+#define CV_16FC3 CV_MAKETYPE(CV_16F,3)
+#define CV_16FC4 CV_MAKETYPE(CV_16F,4)
+#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
 //! @}
 //! @name Comparison operation

--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -296,8 +296,10 @@ public:
        DEPTH_MASK_32S = 1 << CV_32S,
        DEPTH_MASK_32F = 1 << CV_32F,
        DEPTH_MASK_64F = 1 << CV_64F,
+        DEPTH_MASK_16F = 1 << CV_16F,
        DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1,
        DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S,
+        DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1,
        DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F
    };

--- a/modules/core/include/opencv2/core/saturate.hpp
+++ b/modules/core/include/opencv2/core/saturate.hpp
@@ -158,6 +158,22 @@ template<> inline uint64 saturate_cast<uint64>(int64 v)      { return (uint64)st
 template<> inline int64 saturate_cast<int64>(uint64 v)       { return (int64)std::min(v, (uint64)LLONG_MAX); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); }
+// in theory, we could use a LUT for 8u/8s->16f conversion,
+// but with hardware support for FP32->FP16 conversion the current approach is preferable
+template<> inline float16_t saturate_cast<float16_t>(uchar v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(schar v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(ushort v)  { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(short v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(unsigned v){ return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(int v)     { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(uint64 v)  { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(int64 v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(float v)   { return float16_t(v); }
+template<> inline float16_t saturate_cast<float16_t>(double v)  { return float16_t((float)v); }
 //! @}
 } // cv

--- a/modules/core/include/opencv2/core/traits.hpp
+++ b/modules/core/include/opencv2/core/traits.hpp
@@ -261,6 +261,20 @@ public:
         };
 };
+template<> class DataType<float16_t>
+{
+public:
+    typedef float16_t   value_type;
+    typedef float       work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_16F,
+           channels     = 1,
+           fmt          = (int)'h',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
 /** @brief A helper class for cv::DataType
@@ -330,6 +344,12 @@ template<> class TypeDepth<CV_64F>
    typedef double value_type;
 };
+template<> class TypeDepth<CV_16F>
+{
+    enum { depth = CV_16F };
+    typedef float16_t value_type;
+};
 #endif
 //! @}

--- a/modules/core/src/array.cpp
+++ b/modules/core/src/array.cpp
@@ -3262,6 +3262,9 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
    case CV_64F:
        scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
        break;
+    case CV_16F:
+        scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
+        break;
    default:
        CV_Error(CV_StsUnsupportedFormat,"");
    }

--- a/modules/core/src/check.cpp
+++ b/modules/core/src/check.cpp
@@ -43,15 +43,15 @@ static const char* getTestOpMath(unsigned testOp)
 const char* depthToString_(int depth)
 {
-    static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" };
+    static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_16F" };
-    return (depth <= CV_USRTYPE1 && depth >= 0) ? depthNames[depth] : NULL;
+    return (depth <= CV_16F && depth >= 0) ? depthNames[depth] : NULL;
 }
 const cv::String typeToString_(int type)
 {
    int depth = CV_MAT_DEPTH(type);
    int cn = CV_MAT_CN(type);
-    if (depth >= 0 && depth <= CV_USRTYPE1)
+    if (depth >= 0 && depth <= CV_16F)
        return cv::format("%sC%d", depthToString_(depth), cn);
    return cv::String();
 }

--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -8,7 +8,7 @@
 namespace cv {
-/*namespace hal {
+namespace hal {
 void cvt16f32f( const float16_t* src, float* dst, int len )
 {
@@ -50,21 +50,21 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
        dst[j] = float16_t(src[j]);
 }
-/*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
+void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
 {
    // the loop is simple enough, so we let the compiler to vectorize it
    for( int i = 0; i < len; i++ )
-        arr[i] = scaleBiasPairs[i*2 + 1];
+        arr[i] += scaleBiasPairs[i*2 + 1];
 }
 void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len )
 {
    // the loop is simple enough, so we let the compiler to vectorize it
    for( int i = 0; i < len; i++ )
-        arr[i] = scaleBiasPairs[i*2 + 1];
+        arr[i] += scaleBiasPairs[i*2 + 1];
 }
-}*/
+}
 template<typename _Ts, typename _Td, typename _Twvec> inline void
 cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
@@ -150,7 +150,7 @@ DEF_CVT_FUNC(8u16s, cvt_,  uchar, short,    v_int16)
 DEF_CVT_FUNC(8u32s, cvt_,  uchar, int,      v_int32)
 DEF_CVT_FUNC(8u32f, cvt_,  uchar, float,    v_float32)
 DEF_CVT_FUNC(8u64f, cvt_,  uchar, double,   v_int32)
-//DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
+DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
 ////////////////////// 8s -> ... ////////////////////////
@@ -160,7 +160,7 @@ DEF_CVT_FUNC(8s16s, cvt_,  schar, short,    v_int16)
 DEF_CVT_FUNC(8s32s, cvt_,  schar, int,      v_int32)
 DEF_CVT_FUNC(8s32f, cvt_,  schar, float,    v_float32)
 DEF_CVT_FUNC(8s64f, cvt_,  schar, double,   v_int32)
-//DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
+DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
 ////////////////////// 16u -> ... ////////////////////////
@@ -170,7 +170,7 @@ DEF_CVT_FUNC(16u16s, cvt_, ushort, short,  v_int32)
 DEF_CVT_FUNC(16u32s, cvt_, ushort, int,    v_int32)
 DEF_CVT_FUNC(16u32f, cvt_, ushort, float,  v_float32)
 DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32)
-//DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
+DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
 ////////////////////// 16s -> ... ////////////////////////
@@ -180,7 +180,7 @@ DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32)
 DEF_CVT_FUNC(16s32s, cvt_, short, int,    v_int32)
 DEF_CVT_FUNC(16s32f, cvt_, short, float,  v_float32)
 DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32)
-//DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
+DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
 ////////////////////// 32s -> ... ////////////////////////
@@ -190,7 +190,7 @@ DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32)
 DEF_CVT_FUNC(32s16s, cvt_, int, short,  v_int32)
 DEF_CVT_FUNC(32s32f, cvt_, int, float,  v_float32)
 DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32)
-//DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
+DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
 ////////////////////// 32f -> ... ////////////////////////
@@ -210,17 +210,17 @@ DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32)
 DEF_CVT_FUNC(64f16s, cvt_, double, short,  v_int32)
 DEF_CVT_FUNC(64f32s, cvt_, double, int,    v_int32)
 DEF_CVT_FUNC(64f32f, cvt_, double, float,  v_float32)
-//DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
+DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
 ////////////////////// 16f -> ... ////////////////////////
-//DEF_CVT_FUNC(16f8u,  cvt_,  float16_t, uchar,  v_float32)
+DEF_CVT_FUNC(16f8u,  cvt_,  float16_t, uchar,  v_float32)
-//DEF_CVT_FUNC(16f8s,  cvt_,  float16_t, schar,  v_float32)
+DEF_CVT_FUNC(16f8s,  cvt_,  float16_t, schar,  v_float32)
-//DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
+DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
-//DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short,  v_float32)
+DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short,  v_float32)
-//DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int,    v_float32)
+DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int,    v_float32)
 DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float,  v_float32)
-//DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
+DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
 ///////////// "conversion" w/o conversion ///////////////
@@ -339,42 +339,41 @@ BinaryFunc getConvertFunc(int sdepth, int ddepth)
        {
            (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
            (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 //(BinaryFunc)(cvt16f8u)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f8u), (BinaryFunc)(cvt16f8u)
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
            (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 //(BinaryFunc)(cvt16f8s)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f8s), (BinaryFunc)(cvt16f8s)
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
            (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 //(BinaryFunc)(cvt16f16u)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f16u), (BinaryFunc)(cvt16f16u)
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
            (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 //(BinaryFunc)(cvt16f16s)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f16s), (BinaryFunc)(cvt16f16s)
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
            (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 //(BinaryFunc)(cvt16f32s)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f32s), (BinaryFunc)(cvt16f32s)
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
            (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
-            (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 //(BinaryFunc)(cvt16f32f)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f32f), (BinaryFunc)(cvt16f32f)
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
            (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
-            (BinaryFunc)(cvt64s), 0 //(BinaryFunc)(cvt16f64f)
+            (BinaryFunc)(cvt64s), (BinaryFunc)(cvt16f64f)
        },
        {
-            0, 0, 0, 0, 0, 0, 0, 0
+            (BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f),
-            //(BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f),
+            (BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u)
-            //(BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u)
        }
    };
    return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
@@ -481,7 +480,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst )
        if(_dst.fixedType())
        {
            ddepth = _dst.depth();
-            CV_Assert(ddepth == CV_16S /*|| ddepth == CV_16F*/);
+            CV_Assert(ddepth == CV_16S || ddepth == CV_16F);
            CV_Assert(_dst.channels() == _src.channels());
        }
        else
@@ -489,7 +488,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst )
        func = (BinaryFunc)cvt32f16f;
        break;
    case CV_16S:
-    //case CV_16F:
+    case CV_16F:
        ddepth = CV_32F;
        func = (BinaryFunc)cvt16f32f;
        break;

--- a/modules/core/src/convert.hpp
+++ b/modules/core/src/convert.hpp
@@ -150,12 +150,11 @@ static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
 static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
 { a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
-//static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
+static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
-//{
+{
-//    a = vx_load_expand(ptr);
+    a = vx_load_expand(ptr);
-//    b = vx_load_expand(ptr + v_float32::nlanes);
+    b = vx_load_expand(ptr + v_float32::nlanes);
-//}
+}
 static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b)
 {
@@ -295,12 +294,12 @@ static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b
    b = vx_load(ptr + v_float64::nlanes);
 }
-//static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
+static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
-//{
+{
-//    v_float32 v0 = vx_load_expand(ptr);
+    v_float32 v0 = vx_load_expand(ptr);
-//    a = v_cvt_f64(v0);
+    a = v_cvt_f64(v0);
-//    b = v_cvt_f64_high(v0);
+    b = v_cvt_f64_high(v0);
-//}
+}
 static inline void v_store_as(double* ptr, const v_float32& a)
 {
@@ -349,11 +348,11 @@ static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float
    v_store(ptr, v);
 }
-//static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
+static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
-//{
+{
-//    v_float32 v = v_cvt_f32(a, b);
+    v_float32 v = v_cvt_f32(a, b);
-//    v_pack_store(ptr, v);
+    v_pack_store(ptr, v);
-//}
+}
 #else

--- a/modules/core/src/convert_scale.cpp
+++ b/modules/core/src/convert_scale.cpp
@@ -222,7 +222,7 @@ DEF_CVT_SCALE_FUNC(16s8u,  cvt_32f, short,  uchar, float)
 DEF_CVT_SCALE_FUNC(32s8u,  cvt_32f, int,    uchar, float)
 DEF_CVT_SCALE_FUNC(32f8u,  cvt_32f, float,  uchar, float)
 DEF_CVT_SCALE_FUNC(64f8u,  cvt_32f, double, uchar, float)
-//DEF_CVT_SCALE_FUNC(16f8u,  cvt_32f, float16_t, uchar, float)
+DEF_CVT_SCALE_FUNC(16f8u,  cvt_32f, float16_t, uchar, float)
 DEF_CVT_SCALE_FUNC(8u8s,   cvt_32f, uchar,  schar, float)
 DEF_CVT_SCALE_FUNC(8s,     cvt_32f, schar,  schar, float)
@@ -231,7 +231,7 @@ DEF_CVT_SCALE_FUNC(16s8s,  cvt_32f, short,  schar, float)
 DEF_CVT_SCALE_FUNC(32s8s,  cvt_32f, int,    schar, float)
 DEF_CVT_SCALE_FUNC(32f8s,  cvt_32f, float,  schar, float)
 DEF_CVT_SCALE_FUNC(64f8s,  cvt_32f, double, schar, float)
-//DEF_CVT_SCALE_FUNC(16f8s,  cvt_32f, float16_t, schar, float)
+DEF_CVT_SCALE_FUNC(16f8s,  cvt_32f, float16_t, schar, float)
 DEF_CVT_SCALE_FUNC(8u16u,  cvt_32f, uchar,  ushort, float)
 DEF_CVT_SCALE_FUNC(8s16u,  cvt_32f, schar,  ushort, float)
@@ -240,7 +240,7 @@ DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short,  ushort, float)
 DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int,    ushort, float)
 DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float,  ushort, float)
 DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float)
-//DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
+DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
 DEF_CVT_SCALE_FUNC(8u16s,  cvt_32f, uchar,  short, float)
 DEF_CVT_SCALE_FUNC(8s16s,  cvt_32f, schar,  short, float)
@@ -249,7 +249,7 @@ DEF_CVT_SCALE_FUNC(16s,    cvt_32f, short,  short, float)
 DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int,    short, float)
 DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float,  short, float)
 DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float)
-//DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
+DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
 DEF_CVT_SCALE_FUNC(8u32s,  cvt_32f, uchar,  int, float)
 DEF_CVT_SCALE_FUNC(8s32s,  cvt_32f, schar,  int, float)
@@ -258,7 +258,7 @@ DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short,  int, float)
 DEF_CVT_SCALE_FUNC(32s,    cvt_64f, int,    int, double)
 DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float,  int, float)
 DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, int, double)
-//DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
+DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
 DEF_CVT_SCALE_FUNC(8u32f,  cvt_32f, uchar,  float, float)
 DEF_CVT_SCALE_FUNC(8s32f,  cvt_32f, schar,  float, float)
@@ -267,7 +267,7 @@ DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short,  float, float)
 DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int,    float, float)
 DEF_CVT_SCALE_FUNC(32f,    cvt_32f, float,  float, float)
 DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double)
-//DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
+DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
 DEF_CVT_SCALE_FUNC(8u64f,  cvt_64f, uchar,  double, double)
 DEF_CVT_SCALE_FUNC(8s64f,  cvt_64f, schar,  double, double)
@@ -276,16 +276,16 @@ DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short,  double, double)
 DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int,    double, double)
 DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float,  double, double)
 DEF_CVT_SCALE_FUNC(64f,    cvt_64f, double, double, double)
-//DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)
+DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)
-/*DEF_CVT_SCALE_FUNC(8u16f,  cvt1_32f, uchar,  float16_t, float)
+DEF_CVT_SCALE_FUNC(8u16f,  cvt1_32f, uchar,  float16_t, float)
 DEF_CVT_SCALE_FUNC(8s16f,  cvt1_32f, schar,  float16_t, float)
 DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float)
 DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short,  float16_t, float)
 DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int,    float16_t, float)
 DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float,  float16_t, float)
 DEF_CVT_SCALE_FUNC(64f16f, cvt_64f,  double, float16_t, double)
-DEF_CVT_SCALE_FUNC(16f,    cvt1_32f, float16_t, float16_t, float)*/
+DEF_CVT_SCALE_FUNC(16f,    cvt1_32f, float16_t, float16_t, float)
 static BinaryFunc getCvtScaleAbsFunc(int depth)
 {
@@ -306,43 +306,42 @@ BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
        {
            (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
            (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
-            (BinaryFunc)cvtScale64f8u, 0 //(BinaryFunc)cvtScale16f8u
+            (BinaryFunc)cvtScale64f8u, (BinaryFunc)cvtScale16f8u
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
            (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
-            (BinaryFunc)cvtScale64f8s, 0 //(BinaryFunc)cvtScale16f8s
+            (BinaryFunc)cvtScale64f8s, (BinaryFunc)cvtScale16f8s
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
            (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
-            (BinaryFunc)cvtScale64f16u, 0 //(BinaryFunc)cvtScale16f16u
+            (BinaryFunc)cvtScale64f16u, (BinaryFunc)cvtScale16f16u
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
            (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
-            (BinaryFunc)cvtScale64f16s, 0 //(BinaryFunc)cvtScale16f16s
+            (BinaryFunc)cvtScale64f16s, (BinaryFunc)cvtScale16f16s
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
            (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
-            (BinaryFunc)cvtScale64f32s, 0 //(BinaryFunc)cvtScale16f32s
+            (BinaryFunc)cvtScale64f32s, (BinaryFunc)cvtScale16f32s
        },
        {
            (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
            (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
-            (BinaryFunc)cvtScale64f32f, 0 //(BinaryFunc)cvtScale16f32f
+            (BinaryFunc)cvtScale64f32f, (BinaryFunc)cvtScale16f32f
        },
        {
            (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
            (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
-            (BinaryFunc)cvtScale64f, 0 //(BinaryFunc)cvtScale16f64f
+            (BinaryFunc)cvtScale64f, (BinaryFunc)cvtScale16f64f
        },
        {
-            0, 0, 0, 0, 0, 0, 0, 0
+            (BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
-            /*(BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
            (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f,
-            (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f*/
+            (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f
        },
    };

--- a/modules/core/src/merge.cpp
+++ b/modules/core/src/merge.cpp
@@ -216,8 +216,10 @@ static MergeFunc getMergeFunc(int depth)
 {
    static MergeFunc mergeTab[] =
    {
-        (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u),
-        (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u)
    };
    return mergeTab[depth];

--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@@ -723,7 +723,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
        return result;
    }
-    NormFunc func = getNormFunc(normType >> 1, depth);
+    NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
    CV_Assert( func != 0 );
    const Mat* arrays[] = {&src, &mask, 0};
@@ -737,19 +737,31 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
-    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
+    int j, total = (int)it.size, blockSize = total;
-    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
+    bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    int isum = 0;
    int *ibuf = &result.i;
+    AutoBuffer<float> fltbuf_;
+    float* fltbuf = 0;
    size_t esz = 0;
    if( blockSum )
    {
-        intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
-        blockSize = std::min(blockSize, intSumBlockSize);
-        ibuf = &isum;
        esz = src.elemSize();
+        if( depth == CV_16F )
+        {
+            blockSize = std::min(blockSize, 1024);
+            fltbuf_.allocate(blockSize);
+            fltbuf = fltbuf_.data();
+        }
+        else
+        {
+            int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+            blockSize = std::min(blockSize, intSumBlockSize);
+            ibuf = &isum;
+        }
    }
    for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -757,13 +769,17 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
-            func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
+            const uchar* data = ptrs[0];
-            count += bsz;
+            if( depth == CV_16F )
-            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
+            {
+                hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
+                data = (const uchar*)fltbuf;
+            }
+            func( data, ptrs[1], (uchar*)ibuf, bsz, cn );
+            if( blockSum && depth != CV_16F )
            {
                result.d += isum;
                isum = 0;
-                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
@@ -1181,7 +1197,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
        return result;
    }
-    NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
+    NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
    CV_Assert( func != 0 );
    const Mat* arrays[] = {&src1, &src2, &mask, 0};
@@ -1196,19 +1212,31 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
-    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
+    int j, total = (int)it.size, blockSize = total;
-    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
+    bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    unsigned isum = 0;
    unsigned *ibuf = &result.u;
+    AutoBuffer<float> fltbuf_;
+    float* fltbuf = 0;
    size_t esz = 0;
    if( blockSum )
    {
-        intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
-        blockSize = std::min(blockSize, intSumBlockSize);
-        ibuf = &isum;
        esz = src1.elemSize();
+        if( depth == CV_16F )
+        {
+            blockSize = std::min(blockSize, 1024);
+            fltbuf_.allocate((size_t)blockSize*cn*2); // float scratch for both inputs: blockSize*cn values each, because func reads bsz*cn values per source
+            fltbuf = fltbuf_.data();
+        }
+        else
+        {
+            int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+            blockSize = std::min(blockSize, intSumBlockSize);
+            ibuf = &isum;
+        }
    }
    for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -1216,13 +1244,19 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
-            func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
+            const uchar *data0 = ptrs[0], *data1 = ptrs[1];
-            count += bsz;
+            if( depth == CV_16F )
-            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
+            {
+                hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz*cn); // convert every channel value of the block, not just bsz of them
+                hal::cvt16f32f((const float16_t*)ptrs[1], fltbuf + bsz*cn, bsz*cn); // second plane starts after the first plane's bsz*cn floats
+                data0 = (const uchar*)fltbuf;
+                data1 = (const uchar*)(fltbuf + bsz*cn);
+            }
+            func( data0, data1, ptrs[2], (uchar*)ibuf, bsz, cn );
+            if( blockSum && depth != CV_16F )
            {
                result.d += isum;
                isum = 0;
-                count = 0;
            }
            ptrs[0] += bsz*esz;
            ptrs[1] += bsz*esz;

--- a/modules/core/src/out.cpp
+++ b/modules/core/src/out.cpp
@@ -77,6 +77,7 @@ namespace cv
        void valueToStr32s() { sprintf(buf, "%d", mtx.ptr<int>(row, col)[cn]); }
        void valueToStr32f() { sprintf(buf, floatFormat, mtx.ptr<float>(row, col)[cn]); }
        void valueToStr64f() { sprintf(buf, floatFormat, mtx.ptr<double>(row, col)[cn]); }
+        void valueToStr16f() { sprintf(buf, floatFormat, (float)mtx.ptr<float16_t>(row, col)[cn]); } // CV_16F element: widened to float, then printed with the same floatFormat the 32f/64f variants use
        void valueToStrOther() { buf[0] = 0; }
    public:
@@ -115,7 +116,8 @@ namespace cv
                case CV_32S: valueToStr = &FormattedImpl::valueToStr32s; break;
                case CV_32F: valueToStr = &FormattedImpl::valueToStr32f; break;
                case CV_64F: valueToStr = &FormattedImpl::valueToStr64f; break;
-                default:     valueToStr = &FormattedImpl::valueToStrOther; break;
+                default:     CV_Assert(mtx.depth() == CV_16F);
+                             valueToStr = &FormattedImpl::valueToStr16f;
            }
        }
@@ -256,7 +258,12 @@ namespace cv
    class FormatterBase : public Formatter
    {
    public:
-        FormatterBase() : prec32f(8), prec64f(16), multiline(true) {}
+        FormatterBase() : prec16f(4), prec32f(8), prec64f(16), multiline(true) {}
+        void set16fPrecision(int p) CV_OVERRIDE // Formatter override; the constructor initialises prec16f to 4
+        {
+            prec16f = p; // NOTE(review): presumably the digit count for CV_16F output - the code that reads prec16f is not in view, confirm
+        }
        void set32fPrecision(int p) CV_OVERRIDE
        {
@@ -274,6 +281,7 @@ namespace cv
        }
    protected:
+        int prec16f;
        int prec32f;
        int prec64f;
        int multiline;
@@ -325,7 +333,7 @@ namespace cv
        {
            static const char* numpyTypes[] =
            {
-                "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "uint64"
+                "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "float16"
            };
            char braces[5] = {'[', ']', ',', '[', ']'};
            if (mtx.cols == 1)

--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
--- a/modules/core/src/split.cpp
+++ b/modules/core/src/split.cpp
@@ -224,8 +224,10 @@ static SplitFunc getSplitFunc(int depth)
 {
    static SplitFunc splitTab[] =
    {
-        (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u),
-        (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u)
    };
    return splitTab[depth];

--- a/modules/core/test/ocl/test_matrix_expr.cpp
+++ b/modules/core/test/ocl/test_matrix_expr.cpp
@@ -78,7 +78,7 @@ OCL_TEST_P(UMatExpr, Ones)
 //////////////////////////////// Instantiation /////////////////////////////////////////////////
-OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS));
+OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS_16F, OCL_ALL_CHANNELS));
 } } // namespace opencv_test::ocl

--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -476,7 +476,7 @@ struct CopyOp : public BaseElemWiseOp
    }
    int getRandomType(RNG& rng)
    {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
+        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS); // 1..ARITHM_MAX_CHANNELS channels; depth drawn from DEPTH_MASK_ALL_16F (presumably DEPTH_MASK_ALL plus CV_16F - confirm)
    }
    double getMaxErr(int)
    {
@@ -498,7 +498,7 @@ struct SetOp : public BaseElemWiseOp
    }
    int getRandomType(RNG& rng)
    {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
+        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS); // 1..ARITHM_MAX_CHANNELS channels; depth drawn from DEPTH_MASK_ALL_16F (presumably DEPTH_MASK_ALL plus CV_16F - confirm)
    }
    double getMaxErr(int)
    {

--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@@ -372,6 +372,7 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
 #define OCL_ON(...) cv::ocl::setUseOpenCL(true); __VA_ARGS__ ;
 #define OCL_ALL_DEPTHS Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F)
+#define OCL_ALL_DEPTHS_16F Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
 #define OCL_ALL_CHANNELS Values(1, 2, 3, 4)
 CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA, INTER_LINEAR_EXACT)

--- a/modules/ts/include/opencv2/ts/ts_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ts_perf.hpp
@@ -160,7 +160,7 @@ private:
    };                                                                                  \
    static inline void PrintTo(const class_name& t, std::ostream* os) { t.PrintTo(os); } }
-CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_USRTYPE1)
+CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
 /*****************************************************************************************\
 *                 Regression control utility for performance testing                      *

--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -72,10 +72,10 @@ int randomType(RNG& rng, int typeMask, int minChannels, int maxChannels)
 {
    int channels = rng.uniform(minChannels, maxChannels+1);
    int depth = 0;
-    CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL) != 0);
+    CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL_16F) != 0);
    for(;;)
    {
-        depth = rng.uniform(CV_8U, CV_64F+1);
+        depth = rng.uniform(CV_8U, CV_16F+1);
        if( ((1 << depth) & typeMask) != 0 )
            break;
    }
@@ -1260,6 +1260,13 @@ norm_(const _Tp* src1, const _Tp* src2, size_t total, int cn, int normType, doub
 double norm(InputArray _src, int normType, InputArray _mask)
 {
    Mat src = _src.getMat(), mask = _mask.getMat();
+    if( src.depth() == CV_16F )
+    {
+        Mat src32f;
+        src.convertTo(src32f, CV_32F);
+        return cvtest::norm(src32f, normType, _mask);
+    }
    if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
    {
        if( !mask.empty() )
@@ -1340,6 +1347,14 @@ double norm(InputArray _src, int normType, InputArray _mask)
 double norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask)
 {
    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
+    if( src1.depth() == CV_16F )
+    {
+        Mat src1_32f, src2_32f;
+        src1.convertTo(src1_32f, CV_32F);
+        src2.convertTo(src2_32f, CV_32F);
+        return cvtest::norm(src1_32f, src2_32f, normType, _mask);
+    }
    bool isRelative = (normType & NORM_RELATIVE) != 0;
    normType &= ~NORM_RELATIVE;
@@ -1982,11 +1997,20 @@ int check( const Mat& a, double fmin, double fmax, vector<int>* _idx )
 // success_err_level is maximum allowed difference, idx is the index of the first
 // element for which difference is >success_err_level
 // (or index of element with the maximum difference)
-int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
+int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
            double success_err_level, vector<int>* _idx,
            bool element_wise_relative_error )
 {
+    Mat arr = arr_, refarr = refarr_;
    CV_Assert( arr.type() == refarr.type() && arr.size == refarr.size );
+    if( arr.depth() == CV_16F )
+    {
+        Mat arr32f, refarr32f;
+        arr.convertTo(arr32f, CV_32F);
+        refarr.convertTo(refarr32f, CV_32F);
+        arr = arr32f;
+        refarr = refarr32f;
+    }
    int ilevel = refarr.depth() <= CV_32S ? cvFloor(success_err_level) : 0;
    int result = CMP_EPS_OK;

--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -594,11 +594,11 @@ Regression& Regression::operator() (const std::string& name, cv::InputArray arra
    // exit if current test is already failed
    if(::testing::UnitTest::GetInstance()->current_test_info()->result()->Failed()) return *this;
-    if(!array.empty() && array.depth() == CV_USRTYPE1)
+    /*if(!array.empty() && array.depth() == CV_USRTYPE1)
    {
        ADD_FAILURE() << "  Can not check regression for CV_USRTYPE1 data type for " << name;
        return *this;
-    }
+    }*/
    std::string nodename = getCurrentTestNodeName();
@@ -2207,7 +2207,7 @@ void PrintTo(const MatType& t, ::std::ostream* os)
        case CV_32S: *os << "32S"; break;
        case CV_32F: *os << "32F"; break;
        case CV_64F: *os << "64F"; break;
-        case CV_USRTYPE1: *os << "USRTYPE1"; break;
+        case CV_USRTYPE1: *os << "16F"; break;
        default: *os << "INVALID_TYPE"; break;
    }
    *os << 'C' << CV_MAT_CN((int)t);