Commit 7ea34fc2 authored by Nishant Patel, committed by Scott Cyphers

Simple bf16 test case for CPU backend (#3598)

* test bfloat16 with CPU maxpool

* Move test to cpu_test

* convert result from bfloat16 to float

* Modify result op to support bfloat16

* Correct comment

* Add a simple convolution test case

* initialize floats with float literals

* Guard bf16 tests with MKLDNN_VERSION_MAJOR >= 1
parent 61c9d2b0
@@ -350,7 +350,31 @@ namespace ngraph
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Result)
             {
-                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::result);
+                if (args[0].get_element_type() == element::bf16)
+                {
+                    auto& functors = external_function->get_functors();
+                    std::function<void(void*, void*, size_t, int)> kernel;
+                    kernel = ngraph::runtime::cpu::kernel::result<bfloat16>;
+                    auto element_count = out[0].get_size();
+                    auto arg0_buffer_index =
+                        external_function->get_buffer_index(args[0].get_name());
+                    auto out0_buffer_index =
+                        external_function->get_buffer_index(out[0].get_name());
+                    auto functor = [&, kernel, element_count, arg0_buffer_index, out0_buffer_index](
+                        CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
+                        kernel(ctx->buffer_data[arg0_buffer_index],
+                               ctx->buffer_data[out0_buffer_index],
+                               element_count,
+                               ectx->arena);
+                    };
+                    functors.emplace_back(functor);
+                }
+                else
+                {
+                    BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::result);
+                }
             }
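The kernel bound above has the signature void(void*, void*, size_t, int). As a rough sketch of what such a kernel boils down to, assuming it is a plain element-wise copy (the actual ngraph::runtime::cpu::kernel::result implementation may differ, and result_sketch is a hypothetical name):

#include <cstddef>
#include <cstring>

// Hypothetical stand-in for ngraph::runtime::cpu::kernel::result<T>:
// copies `count` elements of type T from `input` to `output`; the final
// argument selects an execution arena in nGraph and is unused here.
template <typename T>
void result_sketch(void* input, void* output, size_t count, int /*arena*/)
{
    std::memcpy(output, input, count * sizeof(T));
}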
@@ -204,13 +204,15 @@ namespace ngraph
                 // Data
                 if (node->get_input_element_type(0) != element::f32 &&
                     node->get_input_element_type(0) != element::i8 &&
-                    node->get_input_element_type(0) != element::u8)
+                    node->get_input_element_type(0) != element::u8 &&
+                    node->get_input_element_type(0) != element::bf16)
                 {
                     return false;
                 }
                 // Weights
                 if (node->get_input_element_type(1) != element::f32 &&
-                    node->get_input_element_type(1) != element::i8)
+                    node->get_input_element_type(1) != element::i8 &&
+                    node->get_input_element_type(1) != element::bf16)
                 {
                     return false;
                 }
@@ -218,7 +220,8 @@ namespace ngraph
                 if (node->get_output_element_type(0) != element::f32 &&
                     node->get_output_element_type(0) != element::i8 &&
                     node->get_output_element_type(0) != element::u8 &&
-                    node->get_output_element_type(0) != element::i32)
+                    node->get_output_element_type(0) != element::i32 &&
+                    node->get_output_element_type(0) != element::bf16)
                 {
                     return false;
                 }
@@ -408,7 +408,8 @@ namespace ngraph
                 (arg0_rank == 5 && max_pool->get_window_shape().size() == 3)) &&
                 (node->get_input_element_type(0) == element::f32 ||
                  node->get_input_element_type(0) == element::u8 ||
-                 node->get_input_element_type(0) == element::i8))
+                 node->get_input_element_type(0) == element::i8 ||
+                 node->get_input_element_type(0) == element::bf16))
             {
                 runtime::cpu::mkldnn_utils::assign_mkldnn_kernel(node);
             }
@@ -25,6 +25,7 @@
 #include "ngraph/ngraph.hpp"
 #include "util/all_close.hpp"
 #include "util/all_close_f.hpp"
+#include "util/float_util.hpp"
 #include "util/ndarray.hpp"
 #include "util/random.hpp"
 #include "util/test_control.hpp"
@@ -49,6 +49,7 @@
 #include "util/all_close_f.hpp"
 #include "util/autodiff/backprop_function.hpp"
 #include "util/autodiff/numeric_compare.hpp"
+#include "util/float_util.hpp"
 #include "util/ndarray.hpp"
 #include "util/random.hpp"
 #include "util/test_tools.hpp"
@@ -2153,3 +2154,99 @@ TEST(cpu_test, tensor_copy_from_different_layout)
     EXPECT_EQ((vector<uint8_t>{1, 4, 2, 5, 3, 6}), read_vector<uint8_t>(b));
 }
+
+#if MKLDNN_VERSION_MAJOR >= 1
+TEST(cpu_test, max_pool_bf16)
+{
+    Shape shape_a{1, 1, 3, 5};
+    Shape window_shape{2, 3};
+    auto window_movement_strides = Strides{1, 1};
+    Shape padding_below{0, 0};
+    Shape padding_above{0, 0};
+    Shape shape_r{1, 1, 2, 3};
+
+    // input data
+    vector<float> a_data = {
+        0.5f, 1.5f, 0.5f, 2.5f, 1.5f, 0.5f, 3.5f, 2.5f, 0.5f, 0.5f, 2.5f, 0.5f, 0.5f, 0.5f, 1.5f};
+
+    // allocate memory for the bf16 copy (half the size of the f32 data)
+    int size = a_data.size() * sizeof(float) / 2;
+    void* bf16_dst = std::malloc(size);
+    // convert float data to bfloat16
+    ngraph::test::float_to_bf16(a_data.data(), bf16_dst, a_data.size());
+
+    auto A = make_shared<op::Parameter>(element::bf16, shape_a);
+    auto QMP = make_shared<ngraph::op::MaxPool>(
+        A, window_shape, window_movement_strides, padding_below, padding_above);
+    auto f = make_shared<Function>(NodeVector{QMP}, ParameterVector{A});
+    auto backend = runtime::Backend::create("CPU");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::bf16, shape_a);
+    a->write(bf16_dst, size);
+    auto result = backend->create_tensor(element::bf16, shape_r);
+    auto handle = backend->compile(f);
+    handle->call_with_validate({result}, {a});
+
+    // convert the output back to float
+    void* fp_dst = std::malloc(shape_size(shape_r) * sizeof(float));
+    ngraph::test::bf16_to_float(
+        static_pointer_cast<runtime::cpu::CPUTensorView>(result)->get_data_ptr(),
+        fp_dst,
+        shape_size(shape_r));
+    auto b = backend->create_tensor(element::f32, shape_r);
+    b->write(fp_dst, shape_size(shape_r) * sizeof(float));
+    EXPECT_EQ((vector<float>{3.5f, 3.5f, 2.5f, 3.5f, 3.5f, 2.5f}), read_vector<float>(b));
+
+    std::free(bf16_dst);
+    std::free(fp_dst);
+}
+
+TEST(cpu_test, convolution_simple_bf16)
+{
+    Shape shape_a{1, 2, 2, 2};
+    auto A = make_shared<op::Parameter>(element::bf16, shape_a);
+    Shape shape_b{2, 2, 1, 1};
+    auto B = make_shared<op::Parameter>(element::bf16, shape_b);
+    Shape shape_r{1, 2, 2, 2};
+    vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+    vector<float> weights = {3.0f, 3.0f, 3.0f, 3.0f};
+
+    // allocate memory for the bf16 copies (half the size of the f32 data)
+    int input_size = input.size() * sizeof(float) / 2;
+    int weights_size = weights.size() * sizeof(float) / 2;
+    void* bf16_input_dst = std::malloc(input_size);
+    void* bf16_weights_dst = std::malloc(weights_size);
+    // convert float data to bfloat16
+    ngraph::test::float_to_bf16(input.data(), bf16_input_dst, input.size());
+    ngraph::test::float_to_bf16(weights.data(), bf16_weights_dst, weights.size());
+
+    auto conv1 = make_shared<op::Convolution>(A,
+                                              B,
+                                              Strides{1, 1},
+                                              Strides{1, 1},
+                                              CoordinateDiff{0, 0},
+                                              CoordinateDiff{0, 0},
+                                              Strides{1, 1});
+    auto f = make_shared<Function>(conv1, ParameterVector{A, B});
+    auto backend = runtime::Backend::create("CPU");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::bf16, shape_a);
+    a->write(bf16_input_dst, input_size);
+    auto b = backend->create_tensor(element::bf16, shape_b);
+    b->write(bf16_weights_dst, weights_size);
+    auto result = backend->create_tensor(element::bf16, shape_r);
+    vector<float> expected_result{18.0f, 24.0f, 30.0f, 36.0f, 18.0f, 24.0f, 30.0f, 36.0f};
+    auto handle = backend->compile(f);
+    handle->call_with_validate({result}, {a, b});
+
+    // convert the output back to float
+    void* fp_dst = std::malloc(shape_size(shape_r) * sizeof(float));
+    ngraph::test::bf16_to_float(
+        static_pointer_cast<runtime::cpu::CPUTensorView>(result)->get_data_ptr(),
+        fp_dst,
+        shape_size(shape_r));
+    auto c = backend->create_tensor(element::f32, shape_r);
+    c->write(fp_dst, shape_size(shape_r) * sizeof(float));
+    EXPECT_TRUE(test::all_close_f(expected_result, read_vector<float>(c)));
+
+    std::free(bf16_input_dst);
+    std::free(bf16_weights_dst);
+    std::free(fp_dst);
+}
+#endif
@@ -184,3 +184,35 @@ double ngraph::test::bits_to_double(const std::string& s)
     du.i = static_cast<uint64_t>(bs.to_ullong());
     return du.d;
 }
+
+// A bf16 number is an f32 number with the low 16 mantissa bits dropped:
+//
+//    S     E         f32 mantissa (23 bits)
+//    0|00011110|0101010|1000011111111000
+//              <------->
+//           bf16 mantissa (7 bits)
+//
+// 1. Right shift the number by 16, which gives  0|00011110|0101010
+// 2. Logical AND with 0xffff:                 & 1|11111111|1111111
+//                                               -------------------
+//                                               0|00011110|0101010
+void ngraph::test::float_to_bf16(void* src, void* dst, int size)
+{
+    // Reinterpret each f32 as a 32-bit integer and keep only its high 16 bits.
+    uint32_t* a = static_cast<uint32_t*>(src);
+    uint16_t* b = static_cast<uint16_t*>(dst);
+    for (; size != 0; b++, size--, a++)
+    {
+        *b = static_cast<uint16_t>((a[0] >> 16) & 0xffff);
+    }
+}
+
+void ngraph::test::bf16_to_float(void* src, void* dst, int size)
+{
+    // Widen each bf16 value back to f32 by zero-filling the low 16 mantissa bits.
+    uint16_t* a = static_cast<uint16_t*>(src);
+    uint32_t* b = static_cast<uint32_t*>(dst);
+    for (; size != 0; a++, b++, size--)
+    {
+        *b = static_cast<uint32_t>(a[0]) << 16;
+    }
+}
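To see what the truncating conversion does to a concrete value, here is a small self-contained illustration; it repeats the same bit manipulation inline rather than calling the helpers above, so it compiles on its own:

#include <cstdint>
#include <cstring>
#include <iostream>

int main()
{
    float value = 1.7f; // bit pattern 0x3FD9999A
    uint32_t bits;
    std::memcpy(&bits, &value, sizeof(bits)); // type-pun without aliasing issues

    uint16_t bf16 = static_cast<uint16_t>(bits >> 16);    // truncate to bf16: 0x3FD9
    uint32_t widened = static_cast<uint32_t>(bf16) << 16; // back to f32 bits: 0x3FD90000

    float restored;
    std::memcpy(&restored, &widened, sizeof(restored));
    std::cout << value << " -> " << restored << "\n"; // prints "1.7 -> 1.69531"
}

The dropped 16 mantissa bits cost precision (1.7 comes back as 1.6953125), which is why the tests above compare against values that are exactly representable in bf16 or use all_close_f.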
@@ -57,5 +57,9 @@ namespace ngraph
             double bits_to_double(const std::string& s);
             float16 bits_to_float16(const std::string& s);
+
+            void float_to_bf16(void* src, void* dst, int size);
+            void bf16_to_float(void* src, void* dst, int size);
         }
     }