Commit b2ad7cd9 authored by Tomoaki Teshima's avatar Tomoaki Teshima

add feature to convert FP32(float) to FP16(half)

  * check compiler support
  * check HW support before executing
  * add test doing round trip conversion from / to FP32
  * treat array correctly if size is not multiple of 4
  * add declaration to prevent warning
  * make it possible to enable fp16 on 32bit ARM
  * make the conversion possible on non-supported HW, too.
  * add test using both HW and SW implementation
parent c3d1f94e
...@@ -146,8 +146,11 @@ if(CMAKE_COMPILER_IS_GNUCXX) ...@@ -146,8 +146,11 @@ if(CMAKE_COMPILER_IS_GNUCXX)
elseif(X86 OR X86_64) elseif(X86 OR X86_64)
add_extra_compiler_option(-mno-sse2) add_extra_compiler_option(-mno-sse2)
endif() endif()
if(ARM)
add_extra_compiler_option("-mfp16-format=ieee")
endif(ARM)
if(ENABLE_NEON) if(ENABLE_NEON)
add_extra_compiler_option("-mfpu=neon") add_extra_compiler_option("-mfpu=neon-fp16")
endif() endif()
if(ENABLE_VFPV3 AND NOT ENABLE_NEON) if(ENABLE_VFPV3 AND NOT ENABLE_NEON)
add_extra_compiler_option("-mfpu=vfpv3") add_extra_compiler_option("-mfpu=vfpv3")
...@@ -167,6 +170,9 @@ if(CMAKE_COMPILER_IS_GNUCXX) ...@@ -167,6 +170,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
add_extra_compiler_option(-mfma) add_extra_compiler_option(-mfma)
endif() endif()
endif() endif()
if((X86 OR X86_64) AND NOT MSVC)
add_extra_compiler_option(-mf16c)
endif((X86 OR X86_64) AND NOT MSVC)
# GCC suppresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalents for all SSEx instructions when needed. # GCC suppresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalents for all SSEx instructions when needed.
if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx") if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
......
...@@ -524,6 +524,17 @@ For example: ...@@ -524,6 +524,17 @@ For example:
CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst, CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
double alpha = 1, double beta = 0); double alpha = 1, double beta = 0);
/** @brief Converts an array between single precision (FP32) and half precision (FP16) floating point.
convertFp16 converts FP32 to FP16 or FP16 to FP32. The direction is selected by the type of the
input array, which has to be CV_32F (FP32 to FP16) or CV_16S (FP16 to FP32) to represent the bit
depth. If the input array has any other type, the function does nothing.
@param src input array.
@param dst output array.
@param useHW if true, use hardware SIMD instructions for the conversion when possible.
*/
CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst, bool useHW = true);
/** @brief Performs a look-up table transform of an array. /** @brief Performs a look-up table transform of an array.
The function LUT fills the output array with values from the look-up table. Indices of the entries The function LUT fills the output array with values from the look-up table. Indices of the entries
......
...@@ -112,7 +112,7 @@ ...@@ -112,7 +112,7 @@
#define CV_CPU_SSE4_1 6 #define CV_CPU_SSE4_1 6
#define CV_CPU_SSE4_2 7 #define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8 #define CV_CPU_POPCNT 8
#define CV_CPU_FP16 9
#define CV_CPU_AVX 10 #define CV_CPU_AVX 10
#define CV_CPU_AVX2 11 #define CV_CPU_AVX2 11
#define CV_CPU_FMA3 12 #define CV_CPU_FMA3 12
...@@ -143,7 +143,7 @@ enum CpuFeatures { ...@@ -143,7 +143,7 @@ enum CpuFeatures {
CPU_SSE4_1 = 6, CPU_SSE4_1 = 6,
CPU_SSE4_2 = 7, CPU_SSE4_2 = 7,
CPU_POPCNT = 8, CPU_POPCNT = 8,
CPU_FP16 = 9,
CPU_AVX = 10, CPU_AVX = 10,
CPU_AVX2 = 11, CPU_AVX2 = 11,
CPU_FMA3 = 12, CPU_FMA3 = 12,
...@@ -193,6 +193,10 @@ enum CpuFeatures { ...@@ -193,6 +193,10 @@ enum CpuFeatures {
# endif # endif
# define CV_POPCNT 1 # define CV_POPCNT 1
# endif # endif
# if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700)
# include <immintrin.h>
# define CV_FP16 1
# endif
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0) # if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
...@@ -223,6 +227,10 @@ enum CpuFeatures { ...@@ -223,6 +227,10 @@ enum CpuFeatures {
# define CV_NEON 1 # define CV_NEON 1
#endif #endif
#if defined __GNUC__ && ((defined (__arm__) && (__ARM_FP & 0x2)) || defined(__aarch64__))
# define CV_FP16 1
#endif
#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ #if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
# define CV_VFP 1 # define CV_VFP 1
#endif #endif
...@@ -253,6 +261,9 @@ enum CpuFeatures { ...@@ -253,6 +261,9 @@ enum CpuFeatures {
#ifndef CV_SSE4_2 #ifndef CV_SSE4_2
# define CV_SSE4_2 0 # define CV_SSE4_2 0
#endif #endif
#ifndef CV_FP16
# define CV_FP16 0
#endif
#ifndef CV_AVX #ifndef CV_AVX
# define CV_AVX 0 # define CV_AVX 0
#endif #endif
......
This diff is collapsed.
...@@ -135,6 +135,7 @@ typedef void (*BinaryFuncC)(const uchar* src1, size_t step1, ...@@ -135,6 +135,7 @@ typedef void (*BinaryFuncC)(const uchar* src1, size_t step1,
uchar* dst, size_t step, int width, int height, uchar* dst, size_t step, int width, int height,
void*); void*);
BinaryFunc getConvertFuncFp16(int ddepth, bool useHW);
BinaryFunc getConvertFunc(int sdepth, int ddepth); BinaryFunc getConvertFunc(int sdepth, int ddepth);
BinaryFunc getCopyMaskFunc(size_t esz); BinaryFunc getCopyMaskFunc(size_t esz);
......
...@@ -291,6 +291,7 @@ struct HWFeatures ...@@ -291,6 +291,7 @@ struct HWFeatures
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
f.have[CV_CPU_FP16] = (cpuid_data[2] & (1<<29)) != 0;
// make the second call to the cpuid command in order to get // make the second call to the cpuid command in order to get
// information about extended features like AVX2 // information about extended features like AVX2
...@@ -338,7 +339,8 @@ struct HWFeatures ...@@ -338,7 +339,8 @@ struct HWFeatures
#if defined ANDROID || defined __linux__ #if defined ANDROID || defined __linux__
#ifdef __aarch64__ #ifdef __aarch64__
f.have[CV_CPU_NEON] = true; f.have[CV_CPU_NEON] = true;
#else f.have[CV_CPU_FP16] = true;
#elif defined __arm__
int cpufile = open("/proc/self/auxv", O_RDONLY); int cpufile = open("/proc/self/auxv", O_RDONLY);
if (cpufile >= 0) if (cpufile >= 0)
...@@ -351,6 +353,7 @@ struct HWFeatures ...@@ -351,6 +353,7 @@ struct HWFeatures
if (auxv.a_type == AT_HWCAP) if (auxv.a_type == AT_HWCAP)
{ {
f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
f.have[CV_CPU_FP16] = (auxv.a_un.a_val & 2) != 0;
break; break;
} }
} }
...@@ -358,8 +361,13 @@ struct HWFeatures ...@@ -358,8 +361,13 @@ struct HWFeatures
close(cpufile); close(cpufile);
} }
#endif #endif
#elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) #elif (defined __clang__ || defined __APPLE__)
#if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
f.have[CV_CPU_NEON] = true; f.have[CV_CPU_NEON] = true;
#endif
#if (defined __ARM_FP && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__))
f.have[CV_CPU_FP16] = true;
#endif
#endif #endif
return f; return f;
......
...@@ -737,6 +737,60 @@ struct ConvertScaleOp : public BaseElemWiseOp ...@@ -737,6 +737,60 @@ struct ConvertScaleOp : public BaseElemWiseOp
int ddepth; int ddepth;
}; };
// Element-wise test operation for convertFp16: op() calls it with useHW = true
// and refop() with useHW = false, so the two results are checked against each
// other. The source type and the value range of the input are both derived
// from the low bits of nextRange.
struct ConvertScaleFp16Op : public BaseElemWiseOp
{
    ConvertScaleFp16Op()
        : BaseElemWiseOp(1, FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0))
        , nextRange(0)
    {
    }

    // Conversion under test (useHW = true).
    void op(const vector<Mat>& src, Mat& dst, const Mat&)
    {
        convertFp16(src[0], dst, true);
    }

    // Reference conversion (useHW = false).
    void refop(const vector<Mat>& src, Mat& dst, const Mat&)
    {
        convertFp16(src[0], dst, false);
    }

    // Bit 0 of nextRange selects the direction:
    //   clear: FP32 -> FP16 (source type CV_32F)
    //   set:   FP16 -> FP32 (source type CV_16S)
    int getRandomType(RNG&)
    {
        const bool fp16Source = (nextRange & 1) != 0;
        return fp16Source ? CV_16S : CV_32F;
    }

    // Bounds for the generated input values, matching the direction chosen by
    // bit 0 of nextRange.
    void getValueRange(int, double& minval, double& maxval)
    {
        const bool fp16Source = (nextRange & 1) != 0;
        if( !fp16Source )
        {
            // FP32 -> FP16: 65504 is the largest finite value FP16 can represent
            maxval = 65504.f;
            minval = -maxval;
            return;
        }

        // FP16 -> FP32: bit 1 of nextRange selects the sign half
        //   clear: positive number range
        //   set:   negative number range
        const bool negativeRange = (nextRange & 2) != 0;
        if( negativeRange )
        {
            minval = -32768; // 0x8000 -0
            maxval = -1024;  // 0xFC00 -Inf
        }
        else
        {
            minval = 0;      // 0x0000 +0
            maxval = 31744;  // 0x7C00 +Inf
        }
    }

    // Maximum error accepted between op() and refop() results.
    double getMaxErr(int)
    {
        return 0.5;
    }

    // Draws a fresh selector; only its two lowest bits are inspected above.
    void generateScalars(int, RNG& rng)
    {
        nextRange = rng.next();
    }

    int nextRange; // random selector: bit 0 = direction, bit 1 = sign half
};
struct ConvertScaleAbsOp : public BaseElemWiseOp struct ConvertScaleAbsOp : public BaseElemWiseOp
{ {
...@@ -1371,6 +1425,7 @@ INSTANTIATE_TEST_CASE_P(Core_Copy, ElemWiseTest, ::testing::Values(ElemWiseOpPtr ...@@ -1371,6 +1425,7 @@ INSTANTIATE_TEST_CASE_P(Core_Copy, ElemWiseTest, ::testing::Values(ElemWiseOpPtr
INSTANTIATE_TEST_CASE_P(Core_Set, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::SetOp))); INSTANTIATE_TEST_CASE_P(Core_Set, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::SetOp)));
INSTANTIATE_TEST_CASE_P(Core_SetZero, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::SetZeroOp))); INSTANTIATE_TEST_CASE_P(Core_SetZero, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::SetZeroOp)));
INSTANTIATE_TEST_CASE_P(Core_ConvertScale, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleOp))); INSTANTIATE_TEST_CASE_P(Core_ConvertScale, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleOp)));
INSTANTIATE_TEST_CASE_P(Core_ConvertScaleFp16, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleFp16Op)));
INSTANTIATE_TEST_CASE_P(Core_ConvertScaleAbs, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleAbsOp))); INSTANTIATE_TEST_CASE_P(Core_ConvertScaleAbs, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleAbsOp)));
INSTANTIATE_TEST_CASE_P(Core_Add, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::AddOp))); INSTANTIATE_TEST_CASE_P(Core_Add, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::AddOp)));
......
...@@ -3064,6 +3064,9 @@ void printVersionInfo(bool useStdOut) ...@@ -3064,6 +3064,9 @@ void printVersionInfo(bool useStdOut)
#if CV_NEON #if CV_NEON
if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon"; if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
#endif #endif
#if CV_FP16
if (checkHardwareSupport(CV_CPU_FP16)) cpu_features += " fp16";
#endif
cpu_features.erase(0, 1); // erase initial space cpu_features.erase(0, 1); // erase initial space
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment