Unverified commit 3b558069, authored by Scott Cyphers and committed by GitHub

Fix float16 denorm conversion, add rounding (#3848)

* Fix float16 denorm conversion, add rounding

* review comments
parent 3d57b025
......@@ -49,6 +49,7 @@ float16::float16(float value)
uint32_t iv;
};
fv = value;
uint32_t hidden_one = 0x00800000;
uint32_t sign = iv & 0x80000000;
uint32_t biased_exp = (iv & 0x7F800000) >> 23;
uint32_t raw_frac = (iv & 0x007FFFFF);
......@@ -67,19 +68,20 @@ float16::float16(float value)
}
else if (exp < -14)
{
// denorm or 0
// denorm
biased_exp = 0;
raw_frac |= 0x00800000;
raw_frac = raw_frac >> (exp + 16);
raw_frac |= hidden_one;
uint32_t shift = (-15 - exp) + (23 - frac_size) + 1;
raw_frac = (raw_frac + (hidden_one >> (shift + 1))) >> shift;
}
else if (exp > 15)
{
biased_exp = 0x1F;
raw_frac = 0;
}
else
else if ((biased_exp != 0 || raw_frac != 0))
{
raw_frac = raw_frac >> (23 - frac_size);
raw_frac = (raw_frac + 0x1000) >> (23 - frac_size);
biased_exp = exp + exp_bias;
}
m_value = (sign >> 16) | (biased_exp << frac_size) | raw_frac;
......
......@@ -87,3 +87,14 @@ TEST(float16, assigns)
EXPECT_EQ(f32arr[i], f16arr[i]);
}
}
// Checks that converting selected values to float16 produces the expected raw
// 16-bit encodings, as reported by to_bits().
TEST(float16, values)
{
    // Inputs are stored as doubles and converted to float16 on initialization below.
    const std::vector<double> inputs{2.73786e-05, 3.87722e-05, -0.0223043};
    // Expected bit pattern for the input at the same index.
    const std::vector<uint16_t> expected_bits = {459, 650, 42422};
    size_t idx = 0;
    while (idx < inputs.size())
    {
        float16 converted = inputs.at(idx);
        EXPECT_EQ(expected_bits.at(idx), converted.to_bits());
        ++idx;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment