Round the right bit with denorms (#3885)

* Round the right bit with denorms
* Rounding to inf

Round the right bit with denorms (#3885)
* Round the right bit with denorms
* Rounding to inf
5fa5854c · Scott Cyphers · GitHub · aa359461 · 5fa5854c · 5fa5854c
Unverified commit 5fa5854c, authored Nov 15, 2019 by Scott Cyphers; committed by GitHub Nov 15, 2019
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 5 deletions

float16.cpp src/ngraph/type/float16.cpp +4 -3

float16.cpp test/float16.cpp +14 -2

No files found.
--- a/src/ngraph/type/float16.cpp
+++ b/src/ngraph/type/float16.cpp
@@ -71,10 +71,11 @@ float16::float16(float value)
        // denorm
        biased_exp = 0;
        raw_frac |= hidden_one;
-        uint32_t shift = (-15 - exp) + (23 - frac_size) + 1;
-        raw_frac = (raw_frac + (hidden_one >> (shift + 1))) >> shift;
+        uint32_t exp_shift = (-15 - exp) + 1;
+        uint32_t shift = exp_shift + (23 - frac_size);
+        raw_frac = (raw_frac + (hidden_one >> (frac_size - exp_shift + 1))) >> shift;
    }
-    else if (exp > 15)
+    else if (exp > 15 || (exp == 15 && raw_frac > 0x7fef00 /* numpy overflow value */))
    {
        biased_exp = 0x1F;
        raw_frac = 0;

--- a/test/float16.cpp
+++ b/test/float16.cpp
@@ -90,8 +90,20 @@ TEST(float16, assigns)

 TEST(float16, values)
 {
-    std::vector<double> f32vec{2.73786e-05, 3.87722e-05, -0.0223043};
-    std::vector<uint16_t> intvals = {459, 650, 42422};
+    std::vector<double> f32vec{2.73786e-05,
+                               3.87722e-05,
+                               -0.0223043,
+                               5.10779e-05,
+                               -5.10779e-05,
+                               -2.553895e-05,
+                               -0.0001021558,
+                               5.960464477539063e-08,
+                               8.940696716308594e-08,
+                               65536.0,
+                               65519.0,
+                               65520.0};
+    std::vector<uint16_t> intvals = {
+        459, 650, 42422, 857, 0x8359, 0x81ac, 0x86b2, 0x01, 0x02, 0x7c00, 0x7bff, 0x7c00};
    for (size_t i = 0; i < f32vec.size(); ++i)
    {
        float16 fp16val = f32vec.at(i);