Unverified commit 517a35e6, authored by Scott Cyphers, committed by GitHub

float16 round halves to even (#4404)

* float16 round halves to even

* NaN details are platform-dependent

* typo

* typos
Co-authored-by: Ilya Churaev <ilya.churaev@intel.com>
parent 39ad8e42
...@@ -44,48 +44,88 @@ static_assert(sizeof(float16) == 2, "class float16 must be exactly 2 bytes"); ...@@ -44,48 +44,88 @@ static_assert(sizeof(float16) == 2, "class float16 must be exactly 2 bytes");
float16::float16(float value) float16::float16(float value)
{ {
// Work in 32-bit and shift right 16 in the end
union { union {
float fv; float fv;
uint32_t iv; uint32_t iv;
}; };
fv = value; fv = value;
uint32_t hidden_one = 0x00800000; // sign
uint32_t sign = iv & 0x80000000; constexpr uint32_t smask = 0x80000000;
uint32_t biased_exp = (iv & 0x7F800000) >> 23; // float32 exp
uint32_t raw_frac = (iv & 0x007FFFFF); constexpr uint32_t emask_32 = 0x7F800000;
int32_t exp = biased_exp - 127; // float32 frac
int32_t min_exp = -14 - frac_size; constexpr uint32_t fmask_32 = 0x007fffff;
if (biased_exp == 0 || exp < min_exp) // float16 exp
constexpr uint32_t emask_16 = 0x7c000000;
// float16 frac
constexpr uint32_t fmask_16 = 0x03ff0000;
// bits for half to even round
constexpr uint32_t rhalf_16 = 0x0001ffff;
// bit value for normal round
constexpr uint32_t rnorm_16 = 0x00007fff;
// bit value for half to even round
constexpr uint32_t reven_16 = 0x00008000;
// low 17 bits when the discarded part is exactly half and the kept LSB is odd (round up to even)
constexpr uint32_t rodd_16 = 0x000018000;
// exp bits in position
uint32_t biased_exp_field_32 = iv & emask_32;
uint32_t frac = (iv & fmask_32) << 3;
if (biased_exp_field_32 == emask_32)
{ {
// Goes to 0 // Inf or NaN
biased_exp = 0; if (frac != 0)
{
// NaN
frac &= fmask_16;
if (frac == 0)
{
frac = 0x00010000;
}
}
m_value = ((iv & smask) | emask_16 | frac) >> 16;
return;
} }
else if (biased_exp == 0xFF) if (biased_exp_field_32 == 0)
{ {
// Infinity or NAN. m_value = (iv & smask) >> 16;
biased_exp = 0x1F; return;
raw_frac = raw_frac >> (23 - frac_size);
} }
else if (exp < -14) int16_t biased_exp_16 = (biased_exp_field_32 >> 23) - 127 + 15;
// In the normalized_16 realm
if ((frac & rhalf_16) == rodd_16 || (frac & rnorm_16) != 0)
{ {
// denorm frac += reven_16;
biased_exp = 0; if (0 != (frac & emask_16))
raw_frac |= hidden_one; {
uint32_t exp_shift = (-15 - exp) + 1; frac &= emask_16;
uint32_t shift = exp_shift + (23 - frac_size); biased_exp_16++;
raw_frac = (raw_frac + (hidden_one >> (frac_size - exp_shift + 1))) >> shift; }
} }
else if (exp > 15 || (exp == 15 && raw_frac > 0x7fef00 /* numpy overflow value */)) frac &= fmask_16;
if (biased_exp_16 > 30)
{ {
biased_exp = 0x1F; // Infinity
raw_frac = 0; m_value = ((iv & smask) | emask_16 | 0) >> 16;
return;
} }
else if ((biased_exp != 0 || raw_frac != 0)) if (biased_exp_16 > 0)
{ {
raw_frac = (raw_frac + 0x1000) >> (23 - frac_size); m_value = ((iv & smask) | biased_exp_16 << 26 | frac) >> 16;
biased_exp = exp + exp_bias; return;
} }
m_value = (sign >> 16) | (biased_exp << frac_size) | raw_frac; // Restore the hidden 1
frac = 0x04000000 | ((iv & fmask_32) << 3);
// Will any bits be shifted off?
uint32_t sticky = (frac & ((1 << (1 - biased_exp_16)) - 1)) ? 1 : 0;
frac >>= 1 + (-biased_exp_16);
frac |= sticky;
if (((frac & rhalf_16) == rodd_16) || ((frac & rnorm_16) != 0))
{
frac += reven_16;
}
m_value = ((iv & smask) | frac) >> 16;
} }
std::string float16::to_string() const std::string float16::to_string() const
...@@ -166,6 +206,12 @@ float16::operator float() const ...@@ -166,6 +206,12 @@ float16::operator float() const
return f_val; return f_val;
} }
// Reports whether the float16 bit pattern of x encodes a NaN.
bool std::isnan(float16 x)
{
    // Mask off the sign bit so only the exponent and fraction fields remain.
    // 0x7c00 is the all-ones exponent with a zero fraction (the infinity
    // pattern); any larger magnitude has a non-zero fraction, i.e. a NaN.
    const auto magnitude_bits = x.to_bits() & 0x7FFF;
    return magnitude_bits > 0x7c00;
}
uint16_t float16::to_bits() const uint16_t float16::to_bits() const
{ {
return m_value; return m_value;
......
...@@ -90,6 +90,8 @@ namespace ngraph ...@@ -90,6 +90,8 @@ namespace ngraph
namespace std namespace std
{ {
bool isnan(ngraph::float16 x);
template <> template <>
class numeric_limits<ngraph::float16> class numeric_limits<ngraph::float16>
{ {
......
...@@ -90,23 +90,32 @@ TEST(float16, assigns) ...@@ -90,23 +90,32 @@ TEST(float16, assigns)
TEST(float16, values) TEST(float16, values)
{ {
std::vector<double> f32vec{2.73786e-05, EXPECT_EQ(static_cast<float16>(test::FloatUnion(0, 112 - 8, (1 << 21) + 0).f).to_bits(),
3.87722e-05, float16(0, 0, 2).to_bits());
-0.0223043, EXPECT_EQ(static_cast<float16>(test::FloatUnion(0, 112 - 8, (1 << 21) + 1).f).to_bits(),
5.10779e-05, float16(0, 0, 3).to_bits());
-5.10779e-05, EXPECT_EQ(static_cast<float16>(1.0 / (256.0 * 65536.0)).to_bits(), float16(0, 0, 1).to_bits());
-2.553895e-05, EXPECT_EQ(static_cast<float16>(1.5 / (256.0 * 65536.0)).to_bits(), float16(0, 0, 2).to_bits());
-0.0001021558, EXPECT_EQ(static_cast<float16>(1.25 / (256.0 * 65536.0)).to_bits(), float16(0, 0, 1).to_bits());
5.960464477539063e-08, EXPECT_EQ(static_cast<float16>(1.0 / (128.0 * 65536.0)).to_bits(), float16(0, 0, 2).to_bits());
8.940696716308594e-08, EXPECT_EQ(static_cast<float16>(1.5 / (128.0 * 65536.0)).to_bits(), float16(0, 0, 3).to_bits());
65536.0, EXPECT_EQ(static_cast<float16>(1.25 / (128.0 * 65536.0)).to_bits(), float16(0, 0, 2).to_bits());
65519.0, EXPECT_EQ(static_cast<float16>(std::numeric_limits<float>::infinity()).to_bits(),
65520.0}; float16(0, 0x1F, 0).to_bits());
std::vector<uint16_t> intvals = { EXPECT_EQ(static_cast<float16>(-std::numeric_limits<float>::infinity()).to_bits(),
459, 650, 42422, 857, 0x8359, 0x81ac, 0x86b2, 0x01, 0x02, 0x7c00, 0x7bff, 0x7c00}; float16(1, 0x1F, 0).to_bits());
for (size_t i = 0; i < f32vec.size(); ++i) EXPECT_TRUE(isnan(static_cast<float16>(std::numeric_limits<float>::quiet_NaN())));
{ EXPECT_TRUE(isnan(static_cast<float16>(std::numeric_limits<float>::signaling_NaN())));
float16 fp16val = f32vec.at(i); EXPECT_EQ(static_cast<float16>(2.73786e-05).to_bits(), 459);
EXPECT_EQ(intvals.at(i), fp16val.to_bits()); EXPECT_EQ(static_cast<float16>(3.87722e-05).to_bits(), 650);
} EXPECT_EQ(static_cast<float16>(-0.0223043).to_bits(), 42422);
EXPECT_EQ(static_cast<float16>(5.10779e-05).to_bits(), 857);
EXPECT_EQ(static_cast<float16>(-5.10779e-05).to_bits(), 0x8359);
EXPECT_EQ(static_cast<float16>(-2.553895e-05).to_bits(), 0x81ac);
EXPECT_EQ(static_cast<float16>(-0.0001021558).to_bits(), 0x86b2);
EXPECT_EQ(static_cast<float16>(5.960464477539063e-08).to_bits(), 0x01);
EXPECT_EQ(static_cast<float16>(8.940696716308594e-08).to_bits(), 0x02);
EXPECT_EQ(static_cast<float16>(65536.0).to_bits(), 0x7c00);
EXPECT_EQ(static_cast<float16>(65519.0).to_bits(), 0x7bff);
EXPECT_EQ(static_cast<float16>(65520.0).to_bits(), 0x7c00);
} }
...@@ -16,22 +16,6 @@ ...@@ -16,22 +16,6 @@
#include "util/float_util.hpp" #include "util/float_util.hpp"
// Overlays a float and a 32-bit unsigned integer on the same storage so a
// value can be written through one member and inspected through the other.
union FloatUnion {
    // Default: the integer member is set to zero.
    FloatUnion()
        : i{0}
    {
    }

    // Store a floating-point value.
    FloatUnion(float val)
        : f{val}
    {
    }

    // Store a raw 32-bit pattern.
    FloatUnion(uint32_t val)
        : i{val}
    {
    }

    float f;
    uint32_t i;
};
// Overlays a double and a 64-bit unsigned integer on the same storage so a
// value can be written through one member and inspected through the other.
union DoubleUnion {
    // Default: the integer member is set to zero.
    DoubleUnion()
        : i{0}
    {
    }

    // Store a floating-point value.
    DoubleUnion(double val)
        : d{val}
    {
    }

    // Store a raw 64-bit pattern.
    DoubleUnion(uint64_t val)
        : i{val}
    {
    }

    double d;
    uint64_t i;
};
std::string ngraph::test::bfloat16_to_bits(bfloat16 f) std::string ngraph::test::bfloat16_to_bits(bfloat16 f)
{ {
std::stringstream ss; std::stringstream ss;
......
...@@ -30,6 +30,10 @@ namespace ngraph ...@@ -30,6 +30,10 @@ namespace ngraph
FloatUnion() { i = 0; } FloatUnion() { i = 0; }
FloatUnion(float val) { f = val; } FloatUnion(float val) { f = val; }
FloatUnion(uint32_t val) { i = val; } FloatUnion(uint32_t val) { i = val; }
// Build the 32-bit pattern from separate fields: s is placed at bit 31,
// e starts at bit 23, and f occupies the low bits. The fields are OR-ed
// together without masking, so callers must pass in-range values. The
// combined word is handed to the single-uint32_t constructor.
FloatUnion(uint32_t s, uint32_t e, uint32_t f)
    : FloatUnion((s << 31) | (e << 23) | f)
{
}
float f; float f;
uint32_t i; uint32_t i;
}; };
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment