Unverified commit 517a35e6, authored by Scott Cyphers, committed by GitHub

float16 round halves to even (#4404)

* float16 round halves to even

* NaN details are platform-dependent

* typo

* typos
Co-authored-by: Ilya Churaev <ilya.churaev@intel.com>
parent 39ad8e42
......@@ -44,48 +44,88 @@ static_assert(sizeof(float16) == 2, "class float16 must be exactly 2 bytes");
/// Builds a float16 from a 32-bit float, rounding to nearest with ties going
/// to the even result ("round half to even").
///
/// The conversion is done on the raw float32 bit pattern. The float32 fraction
/// is shifted left by 3 so that, within a 32-bit word, the float16 fields sit in
/// the upper half: sign in bit 31, exponent in bits 26-30 and fraction in bits
/// 16-25. Bits 0-15 are the part that will be discarded and drive the rounding.
/// The final `>> 16` yields the 16-bit pattern stored in m_value.
///
/// Special cases:
///  - float32 Inf stays Inf; NaN stays NaN (a fraction bit is forced on if the
///    surviving fraction bits would otherwise all be zero).
///  - float32 zero and float32 denormals become a signed zero.
///  - values too large for float16 become signed infinity.
///  - values below the float16 normal range become float16 denormals, or a
///    signed zero when they are too small for that.
///
/// \param value The float32 value to convert.
float16::float16(float value)
{
    // Work in 32-bit and shift right 16 in the end
    union {
        float fv;
        uint32_t iv;
    };
    fv = value;
    // sign
    constexpr uint32_t smask = 0x80000000;
    // float32 exp
    constexpr uint32_t emask_32 = 0x7F800000;
    // float32 frac
    constexpr uint32_t fmask_32 = 0x007fffff;
    // float16 exp
    constexpr uint32_t emask_16 = 0x7c000000;
    // float16 frac
    constexpr uint32_t fmask_16 = 0x03ff0000;
    // bits for half to even round: result LSB (bit 16) plus everything below it
    constexpr uint32_t rhalf_16 = 0x0001ffff;
    // bits below the half bit; non-zero means "not an exact tie"
    constexpr uint32_t rnorm_16 = 0x00007fff;
    // the half bit; adding it performs the rounding
    constexpr uint32_t reven_16 = 0x00008000;
    // exact tie with an odd result LSB: must round up to reach the even value
    constexpr uint32_t rodd_16 = 0x00018000;
    // exp bits in position
    const uint32_t biased_exp_field_32 = iv & emask_32;
    uint32_t frac = (iv & fmask_32) << 3;
    if (biased_exp_field_32 == emask_32)
    {
        // Inf or NaN
        if (frac != 0)
        {
            // NaN: keep the fraction bits that fit in float16
            frac &= fmask_16;
            if (frac == 0)
            {
                // All surviving bits were zero; set one so NaN does not turn into Inf
                frac = 0x00010000;
            }
        }
        m_value = ((iv & smask) | emask_16 | frac) >> 16;
        return;
    }
    if (biased_exp_field_32 == 0)
    {
        // float32 zero or denormal: far below float16 range, keep only the sign
        m_value = (iv & smask) >> 16;
        return;
    }
    // Re-bias the exponent from float32 (127) to float16 (15); may be <= 0
    int32_t biased_exp_16 = static_cast<int32_t>(biased_exp_field_32 >> 23) - 127 + 15;
    // In the normalized_16 realm
    if ((frac & rhalf_16) == rodd_16 || (frac & rnorm_16) != 0)
    {
        // Adding the half bit carries into the result only when the discarded
        // part is above one half, or exactly one half with an odd result LSB.
        frac += reven_16;
        if (0 != (frac & emask_16))
        {
            // The fraction overflowed into the exponent position: bump the exponent
            frac &= emask_16;
            biased_exp_16++;
        }
    }
    frac &= fmask_16;
    if (biased_exp_16 > 30)
    {
        // Infinity
        m_value = ((iv & smask) | emask_16 | 0) >> 16;
        return;
    }
    if (biased_exp_16 > 0)
    {
        // Normal float16
        m_value = ((iv & smask) | static_cast<uint32_t>(biased_exp_16) << 26 | frac) >> 16;
        return;
    }
    // Denormal float16 (or underflow). The significand, including the hidden 1,
    // has to move right by this many positions; it is >= 1 because biased_exp_16 <= 0.
    const uint32_t shift = static_cast<uint32_t>(1 - biased_exp_16);
    if (shift >= 32)
    {
        // Shifting a 32-bit value by 32 or more is undefined behavior. Every
        // significand bit would be shifted out, so the result is a signed zero.
        m_value = (iv & smask) >> 16;
        return;
    }
    // Restore the hidden 1
    frac = 0x04000000 | ((iv & fmask_32) << 3);
    // Will any bits be shifted off?
    const uint32_t sticky = (frac & ((uint32_t{1} << shift) - 1)) ? 1 : 0;
    frac >>= shift;
    // Remember lost bits so an inexact value is never mistaken for an exact tie
    frac |= sticky;
    if (((frac & rhalf_16) == rodd_16) || ((frac & rnorm_16) != 0))
    {
        frac += reven_16;
    }
    m_value = ((iv & smask) | frac) >> 16;
}
std::string float16::to_string() const
......@@ -166,6 +206,12 @@ float16::operator float() const
return f_val;
}
/// Reports whether a float16 holds a NaN.
/// \param x The value to inspect.
/// \return true when, ignoring the sign bit, the bit pattern is greater than
///         0x7c00 (0x7c00 itself is the pattern this file uses for infinity).
bool std::isnan(float16 x)
{
    // Everything except the sign bit
    constexpr uint16_t magnitude_mask = 0x7FFF;
    // All exponent bits set, fraction zero
    constexpr uint16_t infinity_bits = 0x7c00;
    const uint16_t magnitude = x.to_bits() & magnitude_mask;
    // Above the infinity pattern means exponent all ones with a non-zero fraction
    return magnitude > infinity_bits;
}
uint16_t float16::to_bits() const
{
return m_value;
......
......@@ -90,6 +90,8 @@ namespace ngraph
namespace std
{
bool isnan(ngraph::float16 x);
template <>
class numeric_limits<ngraph::float16>
{
......
......@@ -90,23 +90,32 @@ TEST(float16, assigns)
// Checks the bit patterns produced when converting floating-point values to
// float16, by comparing to_bits() against expected 16-bit values.
// NOTE(review): the table-driven loop below and the individual expectations at
// the end of this test cover the same value/bit-pattern pairs; this span looks
// like it holds both sides of a diff -- confirm which form is intended.
TEST(float16, values)
{
// Input values for the table-driven check.
std::vector<double> f32vec{2.73786e-05,
3.87722e-05,
-0.0223043,
5.10779e-05,
-5.10779e-05,
-2.553895e-05,
-0.0001021558,
5.960464477539063e-08,
8.940696716308594e-08,
65536.0,
65519.0,
65520.0};
// Expected float16 bit pattern for the value at the same index in f32vec.
std::vector<uint16_t> intvals = {
459, 650, 42422, 857, 0x8359, 0x81ac, 0x86b2, 0x01, 0x02, 0x7c00, 0x7bff, 0x7c00};
for (size_t i = 0; i < f32vec.size(); ++i)
{
float16 fp16val = f32vec.at(i);
EXPECT_EQ(intvals.at(i), fp16val.to_bits());
}
// Inputs built from raw float32 fields (sign, biased exponent, fraction).
// Exponent 104 is 2^-23 and fraction 1 << 21 adds one quarter, i.e. 2.5 * 2^-24;
// the second case sets one more fraction bit, putting it just above that value.
// The expected values are built with float16's three-argument constructor
// (presumably sign, biased exponent, fraction -- confirm against its declaration).
EXPECT_EQ(static_cast<float16>(test::FloatUnion(0, 112 - 8, (1 << 21) + 0).f).to_bits(),
float16(0, 0, 2).to_bits());
EXPECT_EQ(static_cast<float16>(test::FloatUnion(0, 112 - 8, (1 << 21) + 1).f).to_bits(),
float16(0, 0, 3).to_bits());
// Multiples of 2^-24 (256 * 65536 == 2^24): 1, 1.5 and 1.25 units.
EXPECT_EQ(static_cast<float16>(1.0 / (256.0 * 65536.0)).to_bits(), float16(0, 0, 1).to_bits());
EXPECT_EQ(static_cast<float16>(1.5 / (256.0 * 65536.0)).to_bits(), float16(0, 0, 2).to_bits());
EXPECT_EQ(static_cast<float16>(1.25 / (256.0 * 65536.0)).to_bits(), float16(0, 0, 1).to_bits());
// Multiples of 2^-23 (128 * 65536 == 2^23), i.e. 2, 3 and 2.5 units of 2^-24.
EXPECT_EQ(static_cast<float16>(1.0 / (128.0 * 65536.0)).to_bits(), float16(0, 0, 2).to_bits());
EXPECT_EQ(static_cast<float16>(1.5 / (128.0 * 65536.0)).to_bits(), float16(0, 0, 3).to_bits());
EXPECT_EQ(static_cast<float16>(1.25 / (128.0 * 65536.0)).to_bits(), float16(0, 0, 2).to_bits());
// Infinities keep their sign and map to exponent 0x1F with a zero fraction.
EXPECT_EQ(static_cast<float16>(std::numeric_limits<float>::infinity()).to_bits(),
float16(0, 0x1F, 0).to_bits());
EXPECT_EQ(static_cast<float16>(-std::numeric_limits<float>::infinity()).to_bits(),
float16(1, 0x1F, 0).to_bits());
// NaN inputs are only checked with isnan(); no exact bit pattern is asserted.
EXPECT_TRUE(isnan(static_cast<float16>(std::numeric_limits<float>::quiet_NaN())));
EXPECT_TRUE(isnan(static_cast<float16>(std::numeric_limits<float>::signaling_NaN())));
// Individual expectations for the same value/bit-pattern pairs as the table above.
EXPECT_EQ(static_cast<float16>(2.73786e-05).to_bits(), 459);
EXPECT_EQ(static_cast<float16>(3.87722e-05).to_bits(), 650);
EXPECT_EQ(static_cast<float16>(-0.0223043).to_bits(), 42422);
EXPECT_EQ(static_cast<float16>(5.10779e-05).to_bits(), 857);
EXPECT_EQ(static_cast<float16>(-5.10779e-05).to_bits(), 0x8359);
EXPECT_EQ(static_cast<float16>(-2.553895e-05).to_bits(), 0x81ac);
EXPECT_EQ(static_cast<float16>(-0.0001021558).to_bits(), 0x86b2);
EXPECT_EQ(static_cast<float16>(5.960464477539063e-08).to_bits(), 0x01);
EXPECT_EQ(static_cast<float16>(8.940696716308594e-08).to_bits(), 0x02);
// 65536.0 and 65520.0 are expected to become infinity (0x7c00); 65519.0 is
// expected to become 0x7bff.
EXPECT_EQ(static_cast<float16>(65536.0).to_bits(), 0x7c00);
EXPECT_EQ(static_cast<float16>(65519.0).to_bits(), 0x7bff);
EXPECT_EQ(static_cast<float16>(65520.0).to_bits(), 0x7c00);
}
......@@ -16,22 +16,6 @@
#include "util/float_util.hpp"
/// Overlays a 32-bit float and a 32-bit unsigned integer in the same storage.
union FloatUnion {
    /// Starts with the integer member set to zero.
    FloatUnion()
        : i{0}
    {
    }
    /// Stores a floating-point value in the float member.
    FloatUnion(float val)
        : f{val}
    {
    }
    /// Stores a raw 32-bit pattern in the integer member.
    FloatUnion(uint32_t val)
        : i{val}
    {
    }
    float f;
    uint32_t i;
};
/// Overlays a 64-bit double and a 64-bit unsigned integer in the same storage.
union DoubleUnion {
    /// Starts with the integer member set to zero.
    DoubleUnion()
        : i{0}
    {
    }
    /// Stores a floating-point value in the double member.
    DoubleUnion(double val)
        : d{val}
    {
    }
    /// Stores a raw 64-bit pattern in the integer member.
    DoubleUnion(uint64_t val)
        : i{val}
    {
    }
    double d;
    uint64_t i;
};
std::string ngraph::test::bfloat16_to_bits(bfloat16 f)
{
std::stringstream ss;
......
......@@ -30,6 +30,10 @@ namespace ngraph
// Default construction sets the integer member to zero.
FloatUnion() { i = 0; }
// Stores val in the float member.
FloatUnion(float val) { f = val; }
// Stores val in the integer member.
FloatUnion(uint32_t val) { i = val; }
// Assembles a 32-bit pattern from three fields and delegates to the raw-bits
// constructor: s is shifted to bit 31, e to bit 23, and f is OR-ed in unshifted.
// The fields are not masked, so out-of-range arguments spill into neighboring bits.
FloatUnion(uint32_t s, uint32_t e, uint32_t f)
    : FloatUnion((s << 31) | (e << 23) | f)
{
}
float f;
uint32_t i;
};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment