Commit 0b1a386e authored by Nishant Patel, committed by Scott Cyphers

Modify convert op to convert from bf16 to float and vice-versa (#3689)

* Modify convert op to convert from bf16 to float and vice-versa

* Add a check

* Remove conversion routines and use the convert op for casting

* Disable unit tests for PlaidML
parent 9d45682c
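
In graph terms, the change swaps ad-hoc test-side conversion helpers for the existing op::Convert node, so the CPU backend performs the cast during execution. A minimal sketch of the resulting usage pattern, modeled directly on the tests added below (includes and using-directives omitted, as in the test snippets):

    // Build a one-op graph that casts f32 input to bf16, then run it on the CPU backend.
    Shape shape{2, 3};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto convert = make_shared<op::Convert>(A, element::bf16);
    auto f = make_shared<Function>(NodeVector{convert}, ParameterVector{A});

    auto backend = runtime::Backend::create("CPU");
    auto a = backend->create_tensor(element::f32, shape);
    copy_data(a, vector<float>{0.5f, 1.5f, 2.5f, 0.5f, 1.5f, 2.5f});
    auto result = backend->create_tensor(element::bf16, shape);

    auto handle = backend->compile(f);
    handle->call_with_validate({result}, {a});
    // read_vector<bfloat16>(result) now holds the truncated values.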
@@ -45,6 +45,11 @@ namespace ngraph
                     SELECT_KERNEL(
                         kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_bool)
                 }
+                else if (args[0].get_element_type() == element::bf16 &&
+                         out[0].get_element_type() == element::f32)
+                {
+                    kernel = runtime::cpu::kernel::convert_to_float32<bfloat16>;
+                }
                 else if (out[0].get_element_type() == element::f32)
                 {
                     SELECT_KERNEL(kernel,
@@ -97,6 +102,11 @@ namespace ngraph
                     SELECT_KERNEL(
                         kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_u64)
                 }
+                else if (args[0].get_element_type() == element::f32 &&
+                         out[0].get_element_type() == element::bf16)
+                {
+                    kernel = runtime::cpu::kernel::convert_to_bf16<float>;
+                }
                 else
                 {
                     throw ngraph_error("Cannot convert from an invalid input element type");
@@ -20,6 +20,7 @@
 #include <unsupported/Eigen/CXX11/Tensor>

 #include "ngraph/runtime/cpu/cpu_executor.hpp"
+#include "ngraph/runtime/reference/convert.hpp"

 namespace ngraph
 {
@@ -110,6 +111,12 @@ namespace ngraph
                 {
                     convert<InputElementType, bool>(input, output, count, arena);
                 }
+
+                template <typename InputElementType>
+                void convert_to_bf16(void* input, void* output, size_t count, int arena)
+                {
+                    convert<InputElementType, bfloat16>(input, output, count, arena);
+                }
             }
         }
     }
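
The convert<InputElementType, bfloat16> call these kernels delegate to is, at heart, an element-wise cast over the buffer. A standalone sketch of that idea (a simplification: the real CPU kernel also takes the arena argument shown above and runs through Eigen for parallelism):

    #include <cstddef>

    // Element-wise conversion loop: any OutputType constructible from InputType
    // works, which is how bfloat16 slots in alongside the built-in numeric
    // types: it defines conversions to and from float.
    template <typename InputType, typename OutputType>
    void convert_sketch(const InputType* input, OutputType* output, size_t count)
    {
        for (size_t i = 0; i < count; i++)
        {
            output[i] = static_cast<OutputType>(input[i]);
        }
    }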
@@ -9,3 +9,7 @@ model_matmul_integer_4d_no_zero_point
 fake_quantize
 fake_quantize_with_clip
 fake_quantize_with_clip_across_channels
+
+# casting not supported on interpreter
+convert_float32_bf16
+convert_bf16_float32
@@ -294,3 +294,7 @@ layer_norm_bprop_affine
 # shapes with zeros dimensions like (5, 0, 5) not supported in PlaidML backend
 dyn_replace_slice
+
+# bf16 test cases not supported
+convert_float32_bf16
+convert_bf16_float32
@@ -110,3 +110,62 @@ NGRAPH_TEST(${BACKEND_NAME}, convert_float32_bool)
     handle->call_with_validate({result}, {a});
     EXPECT_EQ((vector<char>{0, 1, 1, 0, 1, 1, 1, 1, 1}), read_vector<char>(result));
 }
+
+NGRAPH_TEST(${BACKEND_NAME}, convert_float32_bf16)
+{
+    Shape shape_a{1, 1, 3, 5};
+
+    // input data
+    vector<float> a_data = {
+        0.5f, 1.5f, 0.5f, 2.5f, 1.5f, 0.5f, 3.5f, 2.5f, 0.5f, 0.5f, 2.5f, 0.5f, 0.5f, 0.5f, 1.5f};
+
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    auto convert = make_shared<op::Convert>(A, element::bf16);
+    auto f = make_shared<Function>(NodeVector{convert}, ParameterVector{A});
+
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape_a);
+    copy_data(a, a_data);
+    auto result = backend->create_tensor(element::bf16, shape_a);
+
+    auto handle = backend->compile(f);
+    handle->call_with_validate({result}, {a});
+    EXPECT_EQ((vector<bfloat16>{
+                  0.5, 1.5, 0.5, 2.5, 1.5, 0.5, 3.5, 2.5, 0.5, 0.5, 2.5, 0.5, 0.5, 0.5, 1.5}),
+              read_vector<bfloat16>(result));
+}
+
+NGRAPH_TEST(${BACKEND_NAME}, convert_bf16_float32)
+{
+    Shape shape_a{1, 1, 3, 5};
+
+    // input data
+    vector<bfloat16> a_data = {
+        0.5, 1.5, 0.5, 2.5, 1.5, 0.5, 3.5, 2.5, 0.5, 0.5, 2.5, 0.5, 0.5, 0.5, 1.5};
+
+    auto A = make_shared<op::Parameter>(element::bf16, shape_a);
+    auto convert = make_shared<op::Convert>(A, element::f32);
+    auto f = make_shared<Function>(NodeVector{convert}, ParameterVector{A});
+
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::bf16, shape_a);
+    copy_data(a, a_data);
+    auto result = backend->create_tensor(element::f32, shape_a);
+
+    auto handle = backend->compile(f);
+    handle->call_with_validate({result}, {a});
+    EXPECT_EQ((vector<float>{0.5f, 1.5f, 0.5f, 2.5f, 1.5f, 0.5f, 3.5f, 2.5f,
+                             0.5f, 0.5f, 2.5f, 0.5f, 0.5f, 0.5f, 1.5f}),
+              read_vector<float>(result));
+}
@@ -2169,55 +2169,36 @@ TEST(cpu_test, max_pool_bf16)
     vector<float> a_data = {
         0.5f, 1.5f, 0.5f, 2.5f, 1.5f, 0.5f, 3.5f, 2.5f, 0.5f, 0.5f, 2.5f, 0.5f, 0.5f, 0.5f, 1.5f};
-    // allocate memory for destination
-    int size = a_data.size() * sizeof(float) / 2;
-    void* bf16_dst = std::malloc(size);
-    // convert float data to bfloat16
-    ngraph::test::float_to_bf16(a_data.data(), bf16_dst, a_data.size());
-    auto A = make_shared<op::Parameter>(element::bf16, shape_a);
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    auto A_bf16 = make_shared<op::Convert>(A, element::bf16);
     auto QMP = make_shared<ngraph::op::MaxPool>(
-        A, window_shape, window_movement_strides, padding_below, padding_above);
+        A_bf16, window_shape, window_movement_strides, padding_below, padding_above);
     auto f = make_shared<Function>(NodeVector{QMP}, ParameterVector{A});
     auto backend = runtime::Backend::create("CPU");
     // Create some tensors for input/output
-    auto a = backend->create_tensor(element::bf16, shape_a);
-    a->write(bf16_dst, size);
+    auto a = backend->create_tensor(element::f32, shape_a);
+    copy_data(a, a_data);
     auto result = backend->create_tensor(element::bf16, shape_r);
     auto handle = backend->compile(f);
     handle->call_with_validate({result}, {a});
-    // convert the output back to float
-    void* fp_dst = malloc(shape_size(shape_r) * 4);
-    ngraph::test::bf16_to_float(
-        static_pointer_cast<runtime::cpu::CPUTensorView>(result)->get_data_ptr(),
-        fp_dst,
-        shape_size(shape_r));
-    auto b = backend->create_tensor(element::f32, shape_r);
-    b->write(fp_dst, shape_size(shape_r) * 4);
-    EXPECT_EQ((vector<float>{3.5f, 3.5f, 2.5f, 3.5f, 3.5f, 2.5f}), read_vector<float>(b));
+    EXPECT_EQ((vector<bfloat16>{3.5, 3.5, 2.5, 3.5, 3.5, 2.5}), read_vector<bfloat16>(result));
 }

 TEST(cpu_test, convolution_simple_bf16)
 {
     Shape shape_a{1, 2, 2, 2};
-    auto A = make_shared<op::Parameter>(element::bf16, shape_a);
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
     Shape shape_b{2, 2, 1, 1};
-    auto B = make_shared<op::Parameter>(element::bf16, shape_b);
+    auto B = make_shared<op::Parameter>(element::f32, shape_b);
     Shape shape_r{1, 2, 2, 2};
     vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
     vector<float> weights = {3.0f, 3.0f, 3.0f, 3.0f};
-    int input_size = input.size() * sizeof(float) / 2;
-    int weights_size = weights.size() * sizeof(float) / 2;
-    void* bf16_input_dst = std::malloc(input_size);
-    void* bf16_weights_dst = std::malloc(weights_size);
-    // convert float data to bfloat16
-    ngraph::test::float_to_bf16(input.data(), bf16_input_dst, input.size());
-    ngraph::test::float_to_bf16(weights.data(), bf16_weights_dst, weights.size());
-    auto conv1 = make_shared<op::Convolution>(A,
-                                              B,
+    auto A_bf16 = make_shared<op::Convert>(A, element::bf16);
+    auto B_bf16 = make_shared<op::Convert>(B, element::bf16);
+    auto conv1 = make_shared<op::Convolution>(A_bf16,
+                                              B_bf16,
                                               Strides{1, 1},
                                               Strides{1, 1},
                                               CoordinateDiff{0, 0},
@@ -2229,24 +2210,15 @@ TEST(cpu_test, convolution_simple_bf16)
     auto backend = runtime::Backend::create("CPU");
     // Create some tensors for input/output
-    auto a = backend->create_tensor(element::bf16, shape_a);
-    a->write(bf16_input_dst, input_size);
-    auto b = backend->create_tensor(element::bf16, shape_b);
-    b->write(bf16_weights_dst, weights_size);
+    auto a = backend->create_tensor(element::f32, shape_a);
+    copy_data(a, input);
+    auto b = backend->create_tensor(element::f32, shape_b);
+    copy_data(b, weights);
     auto result = backend->create_tensor(element::bf16, shape_r);
     vector<float> expected_result{18.0f, 24.0f, 30.0f, 36.0f, 18.0f, 24.0f, 30.0f, 36.0f};
     auto handle = backend->compile(f);
     handle->call_with_validate({result}, {a, b});
-    // convert the output back to float
-    void* fp_dst = malloc(shape_size(shape_r) * 4);
-    ngraph::test::bf16_to_float(
-        static_pointer_cast<runtime::cpu::CPUTensorView>(result)->get_data_ptr(),
-        fp_dst,
-        shape_size(shape_r));
-    auto c = backend->create_tensor(element::f32, shape_r);
-    c->write(fp_dst, shape_size(shape_r) * 4);
-    EXPECT_TRUE(test::all_close_f(vector<float>{expected_result}, read_vector<float>(c)));
+    EXPECT_EQ((vector<bfloat16>{18.0, 24.0, 30.0, 36.0, 18.0, 24.0, 30.0, 36.0}),
+              read_vector<bfloat16>(result));
 }
 #endif
@@ -184,35 +184,3 @@ double ngraph::test::bits_to_double(const std::string& s)
     du.i = static_cast<uint64_t>(bs.to_ullong());
     return du.d;
 }
-
-//             f32 Mantissa
-//             <---------------------->
-//             bf16 Mantissa
-// S  E        <------>
-// 0|00011110|0101010|1000011111111000
-// 1. Right shift number >> 16 which gives      0|00011110|0101010
-// 2. Logical & with 0xffff gives             & 1|11111111|1111111
-//                                             ---------------------
-//                                              0|00011110|0101010
-void ngraph::test::float_to_bf16(void* src, void* dst, int size)
-{
-    int* a = static_cast<int*>(src);
-    char16_t* b = static_cast<char16_t*>(dst);
-
-    for (; size != 0; b++, size--, a++)
-    {
-        *b = (a[0] >> 16) & 0xffff;
-    }
-}
-
-void ngraph::test::bf16_to_float(void* src, void* dst, int size)
-{
-    char16_t* a = static_cast<char16_t*>(src);
-    int* b = static_cast<int*>(dst);
-
-    for (; size != 0; a++, b++, size--)
-    {
-        *b = (a[0] & 0xffff) << 16;
-    }
-}
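
The deleted helpers work because bf16 is a straight truncation of f32: keep the upper 16 bits (sign, 8-bit exponent, top 7 mantissa bits) and zero-fill the low 16 bits on the way back. A self-contained round-trip demo of the same bit manipulation, using memcpy rather than the pointer casts above to stay clear of strict-aliasing issues:

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    int main()
    {
        float f = 2.5f; // bit pattern 0x40200000; the low 16 mantissa bits are zero
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));

        uint16_t bf16 = static_cast<uint16_t>(bits >> 16);    // f32 -> bf16: truncate
        uint32_t widened = static_cast<uint32_t>(bf16) << 16; // bf16 -> f32: zero-fill

        float back;
        std::memcpy(&back, &widened, sizeof(back));
        std::cout << f << " -> " << back << "\n"; // prints "2.5 -> 2.5"
        return 0;
    }

Values such as 0.5, 1.5, 2.5, and 3.5 have all-zero low mantissa bits, which is why the tests in this commit can compare converted results with EXPECT_EQ rather than a tolerance.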
@@ -57,9 +57,5 @@ namespace ngraph
             double bits_to_double(const std::string& s);
             float16 bits_to_float16(const std::string& s);
-
-            void float_to_bf16(void* src, void* dst, int size);
-
-            void bf16_to_float(void* src, void* dst, int size);
         }
     }