Unverified commit 6d7f5871, authored by Vadim Pisarevsky, committed by GitHub

added basic support for CV_16F (the new datatype etc.) (#12463)

* added basic support for CV_16F (the new datatype etc.). CV_USRTYPE1 is now equal to CV_16F, which may break some [rarely used] functionality. We'll see

* fixed a just-introduced bug in norm; reverted erroneous changes in the Torch importer (need to find a better solution)

* addressed some issues found during the PR review

* restored the patch to fix some perf test failures
parent dca657a2
@@ -3009,6 +3009,7 @@ public:
    virtual Ptr<Formatted> format(const Mat& mtx) const = 0;
+     virtual void set16fPrecision(int p = 4) = 0;
virtual void set32fPrecision(int p = 8) = 0;
virtual void set64fPrecision(int p = 16) = 0;
virtual void setMultiline(bool ml = true) = 0;
@@ -317,13 +317,10 @@ Cv64suf;
#define CV_IS_SUBMAT(flags) ((flags) & CV_MAT_SUBMAT_FLAG)
/** Size of each channel item,
-   0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
- #define CV_ELEM_SIZE1(type) \
-     ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)
+   0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
+ #define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)

- /** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
- #define CV_ELEM_SIZE(type) \
-     (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
+ #define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
#ifndef MIN
# define MIN(a,b) ((a) > (b) ? (b) : (a))
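The new LUT drops the `sizeof(size_t)` trick: every depth, including the new CV_16F, now fits in one 32-bit constant, with one nibble per depth code. A minimal sketch of how the nibble lookup resolves; the `MY_*` macros below are hypothetical stand-ins for the real headers:

```cpp
#include <cassert>

// hypothetical simplified stand-ins for CV_MAT_DEPTH / CV_ELEM_SIZE1
#define MY_MAT_DEPTH(flags)  ((flags) & 7)
#define MY_ELEM_SIZE1(type)  ((0x28442211 >> MY_MAT_DEPTH(type)*4) & 15)

int main()
{
    assert(MY_ELEM_SIZE1(0 /*CV_8U*/)  == 1);  // lowest nibble: 1 byte
    assert(MY_ELEM_SIZE1(5 /*CV_32F*/) == 4);
    assert(MY_ELEM_SIZE1(6 /*CV_64F*/) == 8);
    assert(MY_ELEM_SIZE1(7 /*CV_16F*/) == 2);  // new top nibble; depth 7 used to report sizeof(size_t)
    return 0;
}
```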
@@ -195,6 +195,12 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
+ CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
+ CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
+ CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
+ CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
struct CV_EXPORTS DFT1D
{
static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0);
@@ -76,6 +76,7 @@ typedef signed char schar;
#define CV_32F 5
#define CV_64F 6
#define CV_USRTYPE1 7
+ #define CV_16F 7
#define CV_MAT_DEPTH_MASK (CV_DEPTH_MAX - 1)
#define CV_MAT_DEPTH(flags) ((flags) & CV_MAT_DEPTH_MASK)
@@ -124,6 +125,12 @@ typedef signed char schar;
#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))

+ #define CV_16FC1 CV_MAKETYPE(CV_16F,1)
+ #define CV_16FC2 CV_MAKETYPE(CV_16F,2)
+ #define CV_16FC3 CV_MAKETYPE(CV_16F,3)
+ #define CV_16FC4 CV_MAKETYPE(CV_16F,4)
+ #define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
//! @}
//! @name Comparison operation
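A quick sketch of what the new aliases unlock, assuming an OpenCV build that already contains this patch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat m(4, 4, CV_16FC3);   // 3-channel half-float matrix
    CV_Assert(m.depth() == CV_16F && m.channels() == 3);
    CV_Assert(CV_16FC(3) == CV_MAKETYPE(CV_16F, 3));
    CV_Assert(m.elemSize() == 6 && m.elemSize1() == 2);  // 3 channels x 2 bytes
    return 0;
}
```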
@@ -296,8 +296,10 @@ public:
    DEPTH_MASK_32S = 1 << CV_32S,
    DEPTH_MASK_32F = 1 << CV_32F,
    DEPTH_MASK_64F = 1 << CV_64F,
+     DEPTH_MASK_16F = 1 << CV_16F,
    DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1,
    DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S,
+     DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1,
    DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F
};
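Since CV_16F is depth 7, the mask arithmetic works out as below (a sketch, not part of the patch):

```cpp
// DEPTH_MASK_16F     = 1 << 7  = 0x80
// DEPTH_MASK_ALL     = 0x7F    (unchanged: every depth except 16F)
// DEPTH_MASK_ALL_16F = 0xFF    (every depth including 16F)
static_assert((1 << 7) == 0x80 && ((0x80 << 1) - 1) == 0xFF, "mask arithmetic");
```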
@@ -158,6 +158,22 @@ template<> inline uint64 saturate_cast<uint64>(int64 v) { return (uint64)st
template<> inline int64 saturate_cast<int64>(uint64 v) { return (int64)std::min(v, (uint64)LLONG_MAX); }

/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); }

+ // in theory, we could use a LUT for 8u/8s->16f conversion,
+ // but with hardware support for FP32->FP16 conversion the current approach is preferable
+ template<> inline float16_t saturate_cast<float16_t>(uchar v)   { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(schar v)   { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(ushort v)  { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(short v)   { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(unsigned v){ return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(int v)     { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(uint64 v)  { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(int64 v)   { return float16_t((float)v); }
+ template<> inline float16_t saturate_cast<float16_t>(float v)   { return float16_t(v); }
+ template<> inline float16_t saturate_cast<float16_t>(double v)  { return float16_t((float)v); }
//! @}
} // cv
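All integer sources are routed through float, so each cast costs one int-to-float conversion plus one FP32-to-FP16 rounding. Note that this follows IEEE conversion semantics rather than clamping to the FP16 finite range; a hedged sketch, assuming a build with this patch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::float16_t h = cv::saturate_cast<cv::float16_t>(1000);  // exactly representable in FP16
    CV_Assert((float)h == 1000.f);

    h = cv::saturate_cast<cv::float16_t>(100000);              // beyond FP16 max (65504)
    CV_Assert(cvIsInf((float)h));                              // rounds to +inf, not to 65504
    return 0;
}
```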
@@ -261,6 +261,20 @@ public:
    };
};

+ template<> class DataType<float16_t>
+ {
+ public:
+     typedef float16_t value_type;
+     typedef float     work_type;
+     typedef value_type channel_type;
+     typedef value_type vec_type;
+     enum { generic_type = 0,
+            depth        = CV_16F,
+            channels     = 1,
+            fmt          = (int)'h',
+            type         = CV_MAKETYPE(depth, channels)
+          };
+ };
/** @brief A helper class for cv::DataType
@@ -330,6 +344,12 @@ template<> class TypeDepth<CV_64F>
    typedef double value_type;
};

+ template<> class TypeDepth<CV_16F>
+ {
+     enum { depth = CV_16F };
+     typedef float16_t value_type;
+ };
#endif
//! @}
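What the new traits report, sketched below; `work_type` being `float` signals that arithmetic on 16F data is meant to run in FP32:

```cpp
#include <opencv2/core.hpp>

static_assert(cv::DataType<cv::float16_t>::depth == CV_16F, "depth");
static_assert(cv::DataType<cv::float16_t>::type == CV_16FC1, "type");
// accumulate/compute in FP32, store in FP16
static_assert(sizeof(cv::DataType<cv::float16_t>::work_type) == sizeof(float), "work type");
```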
@@ -3262,6 +3262,9 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
    case CV_64F:
        scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
        break;
+     case CV_16F:
+         scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
+         break;
default:
CV_Error(CV_StsUnsupportedFormat,"");
}
@@ -43,15 +43,15 @@ static const char* getTestOpMath(unsigned testOp)
const char* depthToString_(int depth)
{
-     static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" };
-     return (depth <= CV_USRTYPE1 && depth >= 0) ? depthNames[depth] : NULL;
+     static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_16F" };
+     return (depth <= CV_16F && depth >= 0) ? depthNames[depth] : NULL;
}
const cv::String typeToString_(int type)
{
int depth = CV_MAT_DEPTH(type);
int cn = CV_MAT_CN(type);
-     if (depth >= 0 && depth <= CV_USRTYPE1)
+     if (depth >= 0 && depth <= CV_16F)
return cv::format("%sC%d", depthToString_(depth), cn);
return cv::String();
}
@@ -8,7 +8,7 @@
namespace cv {

- /*namespace hal {
+ namespace hal {
void cvt16f32f( const float16_t* src, float* dst, int len )
{
@@ -50,21 +50,21 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
        dst[j] = float16_t(src[j]);
}

- /*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
+ void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
{
    // the loop is simple enough, so we let the compiler vectorize it
    for( int i = 0; i < len; i++ )
-         arr[i] = scaleBiasPairs[i*2 + 1];
+         arr[i] += scaleBiasPairs[i*2 + 1];
}

void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len )
{
    // the loop is simple enough, so we let the compiler vectorize it
    for( int i = 0; i < len; i++ )
-         arr[i] = scaleBiasPairs[i*2 + 1];
+         arr[i] += scaleBiasPairs[i*2 + 1];
}
- }*/
+ }
template<typename _Ts, typename _Td, typename _Twvec> inline void
cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
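The hunk above both activates the `hal::addRNGBias*` helpers and fixes their body: `=` became `+=`, so the bias is added to the already-scaled value instead of overwriting it. A standalone sketch of the intended contract (the function name here is hypothetical):

```cpp
// arr[i] was already set to scale*rng_output by the caller;
// the bias half of each (scale, bias) pair then shifts it into range.
static void add_rng_bias_ref(float* arr, const float* scaleBiasPairs, int len)
{
    for (int i = 0; i < len; i++)
        arr[i] += scaleBiasPairs[i*2 + 1];  // '+=': the buggy version used '=' and dropped arr[i]
}
```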
@@ -150,7 +150,7 @@ DEF_CVT_FUNC(8u16s, cvt_, uchar, short, v_int16)
DEF_CVT_FUNC(8u32s, cvt_, uchar, int, v_int32)
DEF_CVT_FUNC(8u32f, cvt_, uchar, float, v_float32)
DEF_CVT_FUNC(8u64f, cvt_, uchar, double, v_int32)
- //DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
+ DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
////////////////////// 8s -> ... ////////////////////////
@@ -160,7 +160,7 @@ DEF_CVT_FUNC(8s16s, cvt_, schar, short, v_int16)
DEF_CVT_FUNC(8s32s, cvt_, schar, int, v_int32)
DEF_CVT_FUNC(8s32f, cvt_, schar, float, v_float32)
DEF_CVT_FUNC(8s64f, cvt_, schar, double, v_int32)
- //DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
+ DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
////////////////////// 16u -> ... ////////////////////////
@@ -170,7 +170,7 @@ DEF_CVT_FUNC(16u16s, cvt_, ushort, short, v_int32)
DEF_CVT_FUNC(16u32s, cvt_, ushort, int, v_int32)
DEF_CVT_FUNC(16u32f, cvt_, ushort, float, v_float32)
DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32)
- //DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
+ DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
////////////////////// 16s -> ... ////////////////////////
@@ -180,7 +180,7 @@ DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32)
DEF_CVT_FUNC(16s32s, cvt_, short, int, v_int32)
DEF_CVT_FUNC(16s32f, cvt_, short, float, v_float32)
DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32)
- //DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
+ DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
////////////////////// 32s -> ... ////////////////////////
@@ -190,7 +190,7 @@ DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32)
DEF_CVT_FUNC(32s16s, cvt_, int, short, v_int32)
DEF_CVT_FUNC(32s32f, cvt_, int, float, v_float32)
DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32)
- //DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
+ DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
////////////////////// 32f -> ... ////////////////////////
@@ -210,17 +210,17 @@ DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32)
DEF_CVT_FUNC(64f16s, cvt_, double, short, v_int32)
DEF_CVT_FUNC(64f32s, cvt_, double, int, v_int32)
DEF_CVT_FUNC(64f32f, cvt_, double, float, v_float32)
- //DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
+ DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)

////////////////////// 16f -> ... ////////////////////////

- //DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32)
- //DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32)
- //DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
- //DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32)
- //DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32)
+ DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32)
+ DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32)
+ DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
+ DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32)
+ DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32)
DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float, v_float32)
- //DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
+ DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
///////////// "conversion" w/o conversion ///////////////
@@ -339,42 +339,41 @@ BinaryFunc getConvertFunc(int sdepth, int ddepth)
{
(BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
(BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 //(BinaryFunc)(cvt16f8u)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f8u), (BinaryFunc)(cvt16f8u)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
(BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 //(BinaryFunc)(cvt16f8s)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f8s), (BinaryFunc)(cvt16f8s)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
(BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 //(BinaryFunc)(cvt16f16u)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f16u), (BinaryFunc)(cvt16f16u)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
(BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 //(BinaryFunc)(cvt16f16s)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f16s), (BinaryFunc)(cvt16f16s)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
(BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
-         (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 //(BinaryFunc)(cvt16f32s)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f32s), (BinaryFunc)(cvt16f32s)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
(BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
-         (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 //(BinaryFunc)(cvt16f32f)
+         (BinaryFunc)GET_OPTIMIZED(cvt64f32f), (BinaryFunc)(cvt16f32f)
},
{
(BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
(BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
-         (BinaryFunc)(cvt64s), 0 //(BinaryFunc)(cvt16f64f)
+         (BinaryFunc)(cvt64s), (BinaryFunc)(cvt16f64f)
},
{
-         0, 0, 0, 0, 0, 0, 0, 0
-         //(BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f),
-         //(BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u)
+         (BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f),
+         (BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u)
}
};
return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
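With the 16f row and column of `cvtTab` populated, `Mat::convertTo` can reach CV_16F from any depth and back. A quick round-trip sketch, assuming a build with this patch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat u8(2, 2, CV_8U, cv::Scalar(200)), h, back;
    u8.convertTo(h, CV_16F);    // dispatches to cvt8u16f
    h.convertTo(back, CV_8U);   // dispatches to cvt16f8u
    CV_Assert(cv::countNonZero(back != u8) == 0);  // 200 is exactly representable in FP16
    return 0;
}
```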
@@ -481,7 +480,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst )
    if(_dst.fixedType())
    {
        ddepth = _dst.depth();
-         CV_Assert(ddepth == CV_16S /*|| ddepth == CV_16F*/);
+         CV_Assert(ddepth == CV_16S || ddepth == CV_16F);
CV_Assert(_dst.channels() == _src.channels());
}
else
@@ -489,7 +488,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst )
        func = (BinaryFunc)cvt32f16f;
        break;
    case CV_16S:
-     //case CV_16F:
+     case CV_16F:
ddepth = CV_32F;
func = (BinaryFunc)cvt16f32f;
break;
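`convertFp16` now also accepts a true CV_16F source or destination instead of only the legacy CV_16S storage. A round-trip sketch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat src(8, 8, CV_32F), half, back;
    cv::randu(src, 0, 1);
    cv::convertFp16(src, half);   // FP32 -> FP16
    cv::convertFp16(half, back);  // FP16 -> FP32
    CV_Assert(back.type() == CV_32F && back.size() == src.size());
    return 0;
}
```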
@@ -150,12 +150,11 @@ static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }

- //static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
- //{
- //    a = vx_load_expand(ptr);
- //    b = vx_load_expand(ptr + v_float32::nlanes);
- //}
+ static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
+ {
+     a = vx_load_expand(ptr);
+     b = vx_load_expand(ptr + v_float32::nlanes);
+ }
static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b)
{
@@ -295,12 +294,12 @@ static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b
    b = vx_load(ptr + v_float64::nlanes);
}

- //static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
- //{
- //    v_float32 v0 = vx_load_expand(ptr);
- //    a = v_cvt_f64(v0);
- //    b = v_cvt_f64_high(v0);
- //}
+ static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
+ {
+     v_float32 v0 = vx_load_expand(ptr);
+     a = v_cvt_f64(v0);
+     b = v_cvt_f64_high(v0);
+ }
static inline void v_store_as(double* ptr, const v_float32& a)
{
@@ -349,11 +348,11 @@ static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float
    v_store(ptr, v);
}

- //static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
- //{
- //    v_float32 v = v_cvt_f32(a, b);
- //    v_pack_store(ptr, v);
- //}
+ static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
+ {
+     v_float32 v = v_cvt_f32(a, b);
+     v_pack_store(ptr, v);
+ }
#else
@@ -222,7 +222,7 @@ DEF_CVT_SCALE_FUNC(16s8u, cvt_32f, short, uchar, float)
DEF_CVT_SCALE_FUNC(32s8u, cvt_32f, int, uchar, float)
DEF_CVT_SCALE_FUNC(32f8u, cvt_32f, float, uchar, float)
DEF_CVT_SCALE_FUNC(64f8u, cvt_32f, double, uchar, float)
- //DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float)
+ DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float)
DEF_CVT_SCALE_FUNC(8u8s, cvt_32f, uchar, schar, float)
DEF_CVT_SCALE_FUNC(8s, cvt_32f, schar, schar, float)
@@ -231,7 +231,7 @@ DEF_CVT_SCALE_FUNC(16s8s, cvt_32f, short, schar, float)
DEF_CVT_SCALE_FUNC(32s8s, cvt_32f, int, schar, float)
DEF_CVT_SCALE_FUNC(32f8s, cvt_32f, float, schar, float)
DEF_CVT_SCALE_FUNC(64f8s, cvt_32f, double, schar, float)
- //DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float)
+ DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float)
DEF_CVT_SCALE_FUNC(8u16u, cvt_32f, uchar, ushort, float)
DEF_CVT_SCALE_FUNC(8s16u, cvt_32f, schar, ushort, float)
@@ -240,7 +240,7 @@ DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short, ushort, float)
DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int, ushort, float)
DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float, ushort, float)
DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float)
- //DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
+ DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
DEF_CVT_SCALE_FUNC(8u16s, cvt_32f, uchar, short, float)
DEF_CVT_SCALE_FUNC(8s16s, cvt_32f, schar, short, float)
@@ -249,7 +249,7 @@ DEF_CVT_SCALE_FUNC(16s, cvt_32f, short, short, float)
DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int, short, float)
DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float, short, float)
DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float)
- //DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
+ DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
DEF_CVT_SCALE_FUNC(8u32s, cvt_32f, uchar, int, float)
DEF_CVT_SCALE_FUNC(8s32s, cvt_32f, schar, int, float)
@@ -258,7 +258,7 @@ DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short, int, float)
DEF_CVT_SCALE_FUNC(32s, cvt_64f, int, int, double)
DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float, int, float)
DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, int, double)
- //DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
+ DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
DEF_CVT_SCALE_FUNC(8u32f, cvt_32f, uchar, float, float)
DEF_CVT_SCALE_FUNC(8s32f, cvt_32f, schar, float, float)
@@ -267,7 +267,7 @@ DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short, float, float)
DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int, float, float)
DEF_CVT_SCALE_FUNC(32f, cvt_32f, float, float, float)
DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double)
- //DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
+ DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
DEF_CVT_SCALE_FUNC(8u64f, cvt_64f, uchar, double, double)
DEF_CVT_SCALE_FUNC(8s64f, cvt_64f, schar, double, double)
@@ -276,16 +276,16 @@ DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short, double, double)
DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int, double, double)
DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float, double, double)
DEF_CVT_SCALE_FUNC(64f, cvt_64f, double, double, double)
- //DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)
+ DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)

- /*DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float)
+ DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float)
DEF_CVT_SCALE_FUNC(8s16f, cvt1_32f, schar, float16_t, float)
DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float)
DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short, float16_t, float)
DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int, float16_t, float)
DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float, float16_t, float)
DEF_CVT_SCALE_FUNC(64f16f, cvt_64f, double, float16_t, double)
- DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float)*/
+ DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float)
static BinaryFunc getCvtScaleAbsFunc(int depth)
{
@@ -306,43 +306,42 @@ BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
-         (BinaryFunc)cvtScale64f8u, 0 //(BinaryFunc)cvtScale16f8u
+         (BinaryFunc)cvtScale64f8u, (BinaryFunc)cvtScale16f8u
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
-         (BinaryFunc)cvtScale64f8s, 0 //(BinaryFunc)cvtScale16f8s
+         (BinaryFunc)cvtScale64f8s, (BinaryFunc)cvtScale16f8s
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
-         (BinaryFunc)cvtScale64f16u, 0 //(BinaryFunc)cvtScale16f16u
+         (BinaryFunc)cvtScale64f16u, (BinaryFunc)cvtScale16f16u
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
-         (BinaryFunc)cvtScale64f16s, 0 //(BinaryFunc)cvtScale16f16s
+         (BinaryFunc)cvtScale64f16s, (BinaryFunc)cvtScale16f16s
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
-         (BinaryFunc)cvtScale64f32s, 0 //(BinaryFunc)cvtScale16f32s
+         (BinaryFunc)cvtScale64f32s, (BinaryFunc)cvtScale16f32s
},
{
(BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
(BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
-         (BinaryFunc)cvtScale64f32f, 0 //(BinaryFunc)cvtScale16f32f
+         (BinaryFunc)cvtScale64f32f, (BinaryFunc)cvtScale16f32f
},
{
(BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
(BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
-         (BinaryFunc)cvtScale64f, 0 //(BinaryFunc)cvtScale16f64f
+         (BinaryFunc)cvtScale64f, (BinaryFunc)cvtScale16f64f
},
{
-         0, 0, 0, 0, 0, 0, 0, 0
-         /*(BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
+         (BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
        (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f,
-         (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f*/
+         (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f
},
};
@@ -216,8 +216,10 @@ static MergeFunc getMergeFunc(int depth)
{
    static MergeFunc mergeTab[] =
    {
-         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
-         (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
+         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u),
+         (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
+         (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s),
+         (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u)
};
return mergeTab[depth];
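16F planes reuse the 16-bit merge kernel (and, later in this commit, the 16-bit split kernel) because these routines only move 16-bit payloads without interpreting them. A small sketch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat planes[] = { cv::Mat(3, 3, CV_16F), cv::Mat(3, 3, CV_16F) }, merged;
    cv::merge(planes, 2, merged);      // runs the merge16u kernel under the hood
    CV_Assert(merged.type() == CV_16FC2);
    return 0;
}
```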
@@ -723,7 +723,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
        return result;
    }

-     NormFunc func = getNormFunc(normType >> 1, depth);
+     NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
CV_Assert( func != 0 );
const Mat* arrays[] = {&src, &mask, 0};
@@ -737,19 +737,31 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
-     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
-     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
+     int j, total = (int)it.size, blockSize = total;
+     bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    int isum = 0;
    int *ibuf = &result.i;
+     AutoBuffer<float> fltbuf_;
+     float* fltbuf = 0;
+     size_t esz = 0;

    if( blockSum )
    {
-         intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
-         blockSize = std::min(blockSize, intSumBlockSize);
-         ibuf = &isum;
+         esz = src.elemSize();
+         if( depth == CV_16F )
+         {
+             blockSize = std::min(blockSize, 1024);
+             fltbuf_.allocate(blockSize);
+             fltbuf = fltbuf_.data();
+         }
+         else
+         {
+             int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+             blockSize = std::min(blockSize, intSumBlockSize);
+             ibuf = &isum;
+         }
    }
for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -757,13 +769,17 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
-             func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
-             count += bsz;
-             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
+             const uchar* data = ptrs[0];
+             if( depth == CV_16F )
+             {
+                 hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
+                 data = (const uchar*)fltbuf;
+             }
+             func( data, ptrs[1], (uchar*)ibuf, bsz, cn );
+             if( blockSum && depth != CV_16F )
            {
                result.d += isum;
                isum = 0;
-                 count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
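The net effect of this hunk: a CV_16F norm matches the norm of the same data expanded to CV_32F, because 16F input is converted block by block (1024 elements at a time) and fed to the 32F kernel. A sketch, assuming a build with this patch:

```cpp
#include <opencv2/core.hpp>
#include <cmath>

int main()
{
    cv::Mat f32(256, 256, CV_32F), f16, f16as32;
    cv::randu(f32, -1, 1);
    f32.convertTo(f16, CV_16F);        // quantize to half precision
    f16.convertTo(f16as32, CV_32F);    // expand back to float
    double a = cv::norm(f16, cv::NORM_L2), b = cv::norm(f16as32, cv::NORM_L2);
    CV_Assert(std::abs(a - b) < 1e-6 * (1 + b));  // same values, same FP32 kernel
    return 0;
}
```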
@@ -1181,7 +1197,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
        return result;
    }

-     NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
+     NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
CV_Assert( func != 0 );
const Mat* arrays[] = {&src1, &src2, &mask, 0};
@@ -1196,19 +1212,31 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
-     int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
-     bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
+     int j, total = (int)it.size, blockSize = total;
+     bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    unsigned isum = 0;
    unsigned *ibuf = &result.u;
+     AutoBuffer<float> fltbuf_;
+     float* fltbuf = 0;
+     size_t esz = 0;

    if( blockSum )
    {
-         intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
-         blockSize = std::min(blockSize, intSumBlockSize);
-         ibuf = &isum;
+         esz = src1.elemSize();
+         if( depth == CV_16F )
+         {
+             blockSize = std::min(blockSize, 1024);
+             fltbuf_.allocate(blockSize*2);
+             fltbuf = fltbuf_.data();
+         }
+         else
+         {
+             int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+             blockSize = std::min(blockSize, intSumBlockSize);
+             ibuf = &isum;
+         }
    }
for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -1216,13 +1244,19 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
-             func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
-             count += bsz;
-             if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
+             const uchar *data0 = ptrs[0], *data1 = ptrs[1];
+             if( depth == CV_16F )
+             {
+                 hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
+                 hal::cvt16f32f((const float16_t*)ptrs[1], fltbuf + bsz, bsz);
+                 data0 = (const uchar*)fltbuf;
+                 data1 = (const uchar*)(fltbuf + bsz);
+             }
+             func( data0, data1, ptrs[2], (uchar*)ibuf, bsz, cn );
+             if( blockSum && depth != CV_16F )
            {
                result.d += isum;
                isum = 0;
-                 count = 0;
            }
            ptrs[0] += bsz*esz;
            ptrs[1] += bsz*esz;
@@ -77,6 +77,7 @@ namespace cv
    void valueToStr32s() { sprintf(buf, "%d", mtx.ptr<int>(row, col)[cn]); }
    void valueToStr32f() { sprintf(buf, floatFormat, mtx.ptr<float>(row, col)[cn]); }
    void valueToStr64f() { sprintf(buf, floatFormat, mtx.ptr<double>(row, col)[cn]); }
+     void valueToStr16f() { sprintf(buf, floatFormat, (float)mtx.ptr<float16_t>(row, col)[cn]); }
void valueToStrOther() { buf[0] = 0; }
public:
@@ -115,7 +116,8 @@ namespace cv
        case CV_32S: valueToStr = &FormattedImpl::valueToStr32s; break;
        case CV_32F: valueToStr = &FormattedImpl::valueToStr32f; break;
        case CV_64F: valueToStr = &FormattedImpl::valueToStr64f; break;
-         default: valueToStr = &FormattedImpl::valueToStrOther; break;
+         default: CV_Assert(mtx.depth() == CV_16F);
+                  valueToStr = &FormattedImpl::valueToStr16f;
}
}
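Printing now works for 16F matrices: each value is widened to float and formatted with the 16f precision (default 4 digits, adjustable via the new `set16fPrecision` below). A sketch:

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Mat m = (cv::Mat_<float>(1, 3) << 0.1f, 1.5f, -2.25f), h;
    m.convertTo(h, CV_16F);
    std::cout << h << std::endl;   // goes through the new valueToStr16f path
    return 0;
}
```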
@@ -256,7 +258,12 @@ namespace cv
class FormatterBase : public Formatter
{
public:
-     FormatterBase() : prec32f(8), prec64f(16), multiline(true) {}
+     FormatterBase() : prec16f(4), prec32f(8), prec64f(16), multiline(true) {}

+     void set16fPrecision(int p) CV_OVERRIDE
+     {
+         prec16f = p;
+     }
void set32fPrecision(int p) CV_OVERRIDE
{
@@ -274,6 +281,7 @@
    }

protected:
+     int prec16f;
int prec32f;
int prec64f;
int multiline;
@@ -325,7 +333,7 @@ namespace cv
    {
        static const char* numpyTypes[] =
        {
-             "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "uint64"
+             "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "float16"
};
char braces[5] = {'[', ']', ',', '[', ']'};
if (mtx.cols == 1)
@@ -48,18 +48,6 @@
#include "precomp.hpp"

- #if defined _WIN32 || defined WINCE
-     #include <windows.h>
-     #undef small
-     #undef min
-     #undef max
-     #undef abs
- #endif

- #if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-     #include "emmintrin.h"
- #endif
namespace cv
{
@@ -74,12 +62,6 @@ namespace cv
#define RNG_NEXT(x) ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32))

- #ifdef __PPC64__
- #define PPC_MUL_ADD(ret, tmp, p0, p1) \
-     asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret) \
-                : "f" (tmp), "f" (p0), "f" (p1))
- #endif
/***************************************************************************************\
* Pseudo-Random Number Generators (PRNGs) *
\***************************************************************************************/
@@ -154,59 +136,26 @@ template<typename T> static void
randi_( T* arr, int len, uint64* state, const DivStruct* p )
{
    uint64 temp = *state;
-     int i = 0;
-     unsigned t0, t1, v0, v1;
-     for( i = 0; i <= len - 4; i += 4 )
+     for( int i = 0; i < len; i++ )
    {
        temp = RNG_NEXT(temp);
-         t0 = (unsigned)temp;
-         temp = RNG_NEXT(temp);
-         t1 = (unsigned)temp;
-         v0 = (unsigned)(((uint64)t0 * p[i].M) >> 32);
-         v1 = (unsigned)(((uint64)t1 * p[i+1].M) >> 32);
-         v0 = (v0 + ((t0 - v0) >> p[i].sh1)) >> p[i].sh2;
-         v1 = (v1 + ((t1 - v1) >> p[i+1].sh1)) >> p[i+1].sh2;
-         v0 = t0 - v0*p[i].d + p[i].delta;
-         v1 = t1 - v1*p[i+1].d + p[i+1].delta;
-         arr[i] = saturate_cast<T>((int)v0);
-         arr[i+1] = saturate_cast<T>((int)v1);
-         temp = RNG_NEXT(temp);
-         t0 = (unsigned)temp;
-         temp = RNG_NEXT(temp);
-         t1 = (unsigned)temp;
-         v0 = (unsigned)(((uint64)t0 * p[i+2].M) >> 32);
-         v1 = (unsigned)(((uint64)t1 * p[i+3].M) >> 32);
-         v0 = (v0 + ((t0 - v0) >> p[i+2].sh1)) >> p[i+2].sh2;
-         v1 = (v1 + ((t1 - v1) >> p[i+3].sh1)) >> p[i+3].sh2;
-         v0 = t0 - v0*p[i+2].d + p[i+2].delta;
-         v1 = t1 - v1*p[i+3].d + p[i+3].delta;
-         arr[i+2] = saturate_cast<T>((int)v0);
-         arr[i+3] = saturate_cast<T>((int)v1);
+         unsigned t = (unsigned)temp;
+         unsigned v = (unsigned)(((uint64)t * p[i].M) >> 32);
+         v = (v + ((t - v) >> p[i].sh1)) >> p[i].sh2;
+         v = t - v*p[i].d + p[i].delta;
+         arr[i] = saturate_cast<T>((int)v);
    }
-     for( ; i < len; i++ )
-     {
-         temp = RNG_NEXT(temp);
-         t0 = (unsigned)temp;
-         v0 = (unsigned)(((uint64)t0 * p[i].M) >> 32);
-         v0 = (v0 + ((t0 - v0) >> p[i].sh1)) >> p[i].sh2;
-         v0 = t0 - v0*p[i].d + p[i].delta;
-         arr[i] = saturate_cast<T>((int)v0);
-     }
*state = temp;
}
#define DEF_RANDI_FUNC(suffix, type) \
static void randBits_##suffix(type* arr, int len, uint64* state, \
-                                    const Vec2i* p, bool small_flag) \
+                                    const Vec2i* p, void*, bool small_flag) \
{ randBits_(arr, len, state, p, small_flag); } \
\
static void randi_##suffix(type* arr, int len, uint64* state, \
-                            const DivStruct* p, bool ) \
+                            const DivStruct* p, void*, bool ) \
{ randi_(arr, len, state, p); }
DEF_RANDI_FUNC(8u, uchar)
@@ -215,131 +164,62 @@ DEF_RANDI_FUNC(16u, ushort)
DEF_RANDI_FUNC(16s, short)
DEF_RANDI_FUNC(32s, int)

- static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool )
+ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, void*, bool )
{
    uint64 temp = *state;
-     int i = 0;
-     for( ; i <= len - 4; i += 4 )
+     for( int i = 0; i < len; i++ )
    {
-         float f[4];
-         f[0] = (float)(int)(temp = RNG_NEXT(temp));
-         f[1] = (float)(int)(temp = RNG_NEXT(temp));
-         f[2] = (float)(int)(temp = RNG_NEXT(temp));
-         f[3] = (float)(int)(temp = RNG_NEXT(temp));
-         // handwritten SSE is required not for performance but for numerical stability!
-         // both 32-bit gcc and MSVC compilers trend to generate double precision SSE
-         // while 64-bit compilers generate single precision SIMD instructions
-         // so manual vectorisation forces all compilers to the single precision
- #if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-         __m128 q0 = _mm_loadu_ps((const float*)(p + i));
-         __m128 q1 = _mm_loadu_ps((const float*)(p + i + 2));
-         __m128 q01l = _mm_unpacklo_ps(q0, q1);
-         __m128 q01h = _mm_unpackhi_ps(q0, q1);
-         __m128 p0 = _mm_unpacklo_ps(q01l, q01h);
-         __m128 p1 = _mm_unpackhi_ps(q01l, q01h);
-         _mm_storeu_ps(arr + i, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(f), p0), p1));
- #elif defined __ARM_NEON && defined __aarch64__
-         // handwritten NEON is required not for performance but for numerical stability!
-         // 64bit gcc tends to use fmadd instead of separate multiply and add
-         // use volatile to ensure to separate the multiply and add
-         float32x4x2_t q = vld2q_f32((const float*)(p + i));
-         float32x4_t p0 = q.val[0];
-         float32x4_t p1 = q.val[1];
-         volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0);
-         vst1q_f32(arr+i, vaddq_f32(v0, p1));
- #elif defined __PPC64__
-         // inline asm is required for numerical stability!
-         // compilers tends to use floating multiply-add single(fmadds)
-         // instead of separate multiply and add
-         PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]);
-         PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]);
-         PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]);
-         PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]);
- #else
-         arr[i+0] = f[0]*p[i+0][0] + p[i+0][1];
-         arr[i+1] = f[1]*p[i+1][0] + p[i+1][1];
-         arr[i+2] = f[2]*p[i+2][0] + p[i+2][1];
-         arr[i+3] = f[3]*p[i+3][0] + p[i+3][1];
- #endif
-     }
-     for( ; i < len; i++ )
-     {
-         temp = RNG_NEXT(temp);
- #if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-         _mm_store_ss(arr + i, _mm_add_ss(
-                 _mm_mul_ss(_mm_set_ss((float)(int)temp), _mm_set_ss(p[i][0])),
-                 _mm_set_ss(p[i][1]))
-                 );
- #elif defined __ARM_NEON && defined __aarch64__
-         float32x2_t t = vadd_f32(vmul_f32(
-                 vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])),
-                 vdup_n_f32(p[i][1]));
-         arr[i] = vget_lane_f32(t, 0);
- #elif defined __PPC64__
-         PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]);
- #else
-         arr[i] = (int)temp*p[i][0] + p[i][1];
- #endif
+         int t = (int)(temp = RNG_NEXT(temp));
+         arr[i] = (float)(t*p[i][0]);
    }
    *state = temp;
+     // add bias separately to make the generated random numbers
+     // more deterministic, independent of
+     // architecture details (FMA instruction use etc.)
+     hal::addRNGBias32f(arr, &p[0][0], len);
}
static void
- randf_64f( double* arr, int len, uint64* state, const Vec2d* p, bool )
+ randf_64f( double* arr, int len, uint64* state, const Vec2d* p, void*, bool )
{
    uint64 temp = *state;
-     int64 v = 0;
-     int i;
-     for( i = 0; i <= len - 4; i += 4 )
+     for( int i = 0; i < len; i++ )
    {
-         double f0, f1;
        temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         f0 = v*p[i][0] + p[i][1];
-         temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         f1 = v*p[i+1][0] + p[i+1][1];
-         arr[i] = f0; arr[i+1] = f1;
-         temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         f0 = v*p[i+2][0] + p[i+2][1];
-         temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         f1 = v*p[i+3][0] + p[i+3][1];
-         arr[i+2] = f0; arr[i+3] = f1;
+         int64 v = (temp >> 32)|(temp << 32);
+         arr[i] = v*p[i][0];
    }
-     for( ; i < len; i++ )
-     {
-         temp = RNG_NEXT(temp);
-         v = (temp >> 32)|(temp << 32);
-         arr[i] = v*p[i][0] + p[i][1];
-     }
    *state = temp;
+     hal::addRNGBias64f(arr, &p[0][0], len);
}

+ static void randf_16f( float16_t* arr, int len, uint64* state, const Vec2f* p, float* fbuf, bool )
+ {
+     uint64 temp = *state;
+     for( int i = 0; i < len; i++ )
+     {
+         float f = (float)(int)(temp = RNG_NEXT(temp));
+         fbuf[i] = f*p[i][0];
+     }
+     *state = temp;
+     // add bias separately to make the generated random numbers
+     // more deterministic, independent of
+     // architecture details (FMA instruction use etc.)
+     hal::addRNGBias32f(fbuf, &p[0][0], len);
+     hal::cvt32f16f(fbuf, arr, len);
+ }
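End to end, uniform 16F fills now work: values are produced into the float scratch buffer (`fbuf`, wired through as `tmpbuf` below) and packed to FP16 only at the end. A sketch, assuming a build with this patch:

```cpp
#include <opencv2/core.hpp>

int main()
{
    cv::Mat m(4, 4, CV_16FC1), m32;
    cv::theRNG().fill(m, cv::RNG::UNIFORM, -1.0, 1.0);
    m.convertTo(m32, CV_32F);           // minMaxLoc does not take 16F directly
    double mn, mx;
    cv::minMaxLoc(m32, &mn, &mx);
    CV_Assert(-1.0 <= mn && mx <= 1.0); // FP16 rounding may touch the endpoints
    return 0;
}
```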
- typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, bool small_flag);
+ typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, void* tempbuf, bool small_flag);

static RandFunc randTab[][8] =
{
    {
        (RandFunc)randi_8u, (RandFunc)randi_8s, (RandFunc)randi_16u, (RandFunc)randi_16s,
-         (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, 0
+         (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, (RandFunc)randf_16f
    },
},
{
(RandFunc)randBits_8u, (RandFunc)randBits_8s, (RandFunc)randBits_16u, (RandFunc)randBits_16s,
@@ -350,7 +230,7 @@ static RandFunc randTab[][8] =
/*
The code below implements the algorithm described in
"The Ziggurat Method for Generating Random Variables"
- by Marsaglia and Tsang, Journal of Statistical Software.
+ by George Marsaglia and Wai Wan Tsang, Journal of Statistical Software, 2007.
*/
static void
randn_0_1_32f( float* arr, int len, uint64* state )
@@ -631,8 +511,8 @@ void RNG::fill( InputOutputArray _mat, int disttype,
    // for each channel i compute such dparam[0][i] & dparam[1][i],
    // so that a signed 32/64-bit integer X is transformed to
    // the range [param1.val[i], param2.val[i]) using
-     // dparam[1][i]*X + dparam[0][i]
+     // dparam[0][i]*X + dparam[1][i]
-     if( depth == CV_32F )
+     if( depth != CV_64F )
{
fp = (Vec2f*)(parambuf + cn*2);
for( j = 0; j < cn; j++ )
@@ -704,6 +584,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
    AutoBuffer<double> buf;
    uchar* param = 0;
    float* nbuf = 0;
+     float* tmpbuf = 0;
if( disttype == UNIFORM )
{
@@ -727,12 +608,14 @@ void RNG::fill( InputOutputArray _mat, int disttype,
                    p[j + k] = ip[k];
            }
        }
-         else if( depth == CV_32F )
+         else if( depth != CV_64F )
        {
            Vec2f* p = (Vec2f*)param;
            for( j = 0; j < blockSize*cn; j += cn )
                for( k = 0; k < cn; k++ )
                    p[j + k] = fp[k];
+             if( depth == CV_16F )
+                 tmpbuf = (float*)p + blockSize*cn*2;
}
else
{
@@ -755,7 +638,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
        int len = std::min(total - j, blockSize);
        if( disttype == CV_RAND_UNI )
-             func( ptr, len*cn, &state, param, smallFlag );
+             func( ptr, len*cn, &state, param, tmpbuf, smallFlag );
else
{
randn_0_1_32f(nbuf, len*cn, &state);
@@ -224,8 +224,10 @@ static SplitFunc getSplitFunc(int depth)
{
    static SplitFunc splitTab[] =
    {
-         (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
-         (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0
+         (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u),
+         (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
+         (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s),
+         (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u)
};
return splitTab[depth];
@@ -78,7 +78,7 @@ OCL_TEST_P(UMatExpr, Ones)
//////////////////////////////// Instantiation /////////////////////////////////////////////////

- OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS));
+ OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS_16F, OCL_ALL_CHANNELS));
} } // namespace opencv_test::ocl
@@ -476,7 +476,7 @@ struct CopyOp : public BaseElemWiseOp
    }
    int getRandomType(RNG& rng)
    {
-         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
+         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS);
}
double getMaxErr(int)
{
@@ -498,7 +498,7 @@ struct SetOp : public BaseElemWiseOp
    }
    int getRandomType(RNG& rng)
    {
-         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
+         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS);
}
double getMaxErr(int)
{
@@ -372,6 +372,7 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
#define OCL_ON(...) cv::ocl::setUseOpenCL(true); __VA_ARGS__ ;

#define OCL_ALL_DEPTHS Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F)
+ #define OCL_ALL_DEPTHS_16F Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
#define OCL_ALL_CHANNELS Values(1, 2, 3, 4)
CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA, INTER_LINEAR_EXACT)
@@ -160,7 +160,7 @@ private:
}; \
static inline void PrintTo(const class_name& t, std::ostream* os) { t.PrintTo(os); } }

- CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_USRTYPE1)
+ CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
/*****************************************************************************************\
* Regression control utility for performance testing *
@@ -72,10 +72,10 @@ int randomType(RNG& rng, int typeMask, int minChannels, int maxChannels)
{
    int channels = rng.uniform(minChannels, maxChannels+1);
    int depth = 0;
-     CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL) != 0);
+     CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL_16F) != 0);
    for(;;)
    {
-         depth = rng.uniform(CV_8U, CV_64F+1);
+         depth = rng.uniform(CV_8U, CV_16F+1);
if( ((1 << depth) & typeMask) != 0 )
break;
}
@@ -1260,6 +1260,13 @@ norm_(const _Tp* src1, const _Tp* src2, size_t total, int cn, int normType, doub
double norm(InputArray _src, int normType, InputArray _mask)
{
    Mat src = _src.getMat(), mask = _mask.getMat();
+     if( src.depth() == CV_16F )
+     {
+         Mat src32f;
+         src.convertTo(src32f, CV_32F);
+         return cvtest::norm(src32f, normType, _mask);
+     }
if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
{
if( !mask.empty() )
@@ -1340,6 +1347,14 @@ double norm(InputArray _src, int normType, InputArray _mask)
double norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask)
{
    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
+     if( src1.depth() == CV_16F )
+     {
+         Mat src1_32f, src2_32f;
+         src1.convertTo(src1_32f, CV_32F);
+         src2.convertTo(src2_32f, CV_32F);
+         return cvtest::norm(src1_32f, src2_32f, normType, _mask);
+     }
bool isRelative = (normType & NORM_RELATIVE) != 0;
normType &= ~NORM_RELATIVE;
@@ -1982,11 +1997,20 @@ int check( const Mat& a, double fmin, double fmax, vector<int>* _idx )
// success_err_level is maximum allowed difference, idx is the index of the first
// element for which difference is >success_err_level
// (or index of element with the maximum difference)
- int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
+ int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
            double success_err_level, vector<int>* _idx,
            bool element_wise_relative_error )
{
+     Mat arr = arr_, refarr = refarr_;
    CV_Assert( arr.type() == refarr.type() && arr.size == refarr.size );
+     if( arr.depth() == CV_16F )
+     {
+         Mat arr32f, refarr32f;
+         arr.convertTo(arr32f, CV_32F);
+         refarr.convertTo(refarr32f, CV_32F);
+         arr = arr32f;
+         refarr = refarr32f;
+     }
int ilevel = refarr.depth() <= CV_32S ? cvFloor(success_err_level) : 0;
int result = CMP_EPS_OK;
@@ -594,11 +594,11 @@ Regression& Regression::operator() (const std::string& name, cv::InputArray arra
    // exit if current test is already failed
    if(::testing::UnitTest::GetInstance()->current_test_info()->result()->Failed()) return *this;

-     if(!array.empty() && array.depth() == CV_USRTYPE1)
+     /*if(!array.empty() && array.depth() == CV_USRTYPE1)
    {
        ADD_FAILURE() << " Can not check regression for CV_USRTYPE1 data type for " << name;
        return *this;
-     }
+     }*/
std::string nodename = getCurrentTestNodeName();
@@ -2207,7 +2207,7 @@ void PrintTo(const MatType& t, ::std::ostream* os)
        case CV_32S: *os << "32S"; break;
        case CV_32F: *os << "32F"; break;
        case CV_64F: *os << "64F"; break;
-         case CV_USRTYPE1: *os << "USRTYPE1"; break;
+         case CV_USRTYPE1: *os << "16F"; break;
        default: *os << "INVALID_TYPE"; break;
default: *os << "INVALID_TYPE"; break;
}
*os << 'C' << CV_MAT_CN((int)t);