Unverified commit 5fa5854c, authored by Scott Cyphers, committed by GitHub

Round the right bit with denorms (#3885)

* Round the right bit with denorms

* Rounding to inf
parent aa359461
......@@ -71,10 +71,11 @@ float16::float16(float value)
// denorm
biased_exp = 0;
raw_frac |= hidden_one;
uint32_t shift = (-15 - exp) + (23 - frac_size) + 1;
raw_frac = (raw_frac + (hidden_one >> (shift + 1))) >> shift;
uint32_t exp_shift = (-15 - exp) + 1;
uint32_t shift = exp_shift + (23 - frac_size);
raw_frac = (raw_frac + (hidden_one >> (frac_size - exp_shift + 1))) >> shift;
}
else if (exp > 15)
else if (exp > 15 || (exp == 15 && raw_frac > 0x7fef00 /* numpy overflow value */))
{
biased_exp = 0x1F;
raw_frac = 0;
......
......@@ -90,8 +90,20 @@ TEST(float16, assigns)
TEST(float16, values)
{
std::vector<double> f32vec{2.73786e-05, 3.87722e-05, -0.0223043};
std::vector<uint16_t> intvals = {459, 650, 42422};
std::vector<double> f32vec{2.73786e-05,
3.87722e-05,
-0.0223043,
5.10779e-05,
-5.10779e-05,
-2.553895e-05,
-0.0001021558,
5.960464477539063e-08,
8.940696716308594e-08,
65536.0,
65519.0,
65520.0};
std::vector<uint16_t> intvals = {
459, 650, 42422, 857, 0x8359, 0x81ac, 0x86b2, 0x01, 0x02, 0x7c00, 0x7bff, 0x7c00};
for (size_t i = 0; i < f32vec.size(); ++i)
{
float16 fp16val = f32vec.at(i);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment