ngraph commit 60252edd (unverified)

Authored 5 years ago by Scott Cyphers; committed by GitHub 5 years ago.

Merge branch 'master' into ayzhuang/batch_norm_infer_relu_fusion

Parents: 341205cf 47342339
Showing 19 changed files with 185 additions and 229 deletions (+185 -229)
Changed files:

  cmake/external_mkldnn.cmake                            +3    -3
  cmake/mkldnn.patch                                      +0   -13
  doc/sphinx/conf.py                                      +2    -4
  doc/sphinx/ngraph_theme/ngversions.html                 +2    -2
  doc/sphinx/source/project/release-notes.rst            +22   -39
  python/test_requirements.txt                            +1    -0
  src/ngraph/CMakeLists.txt                               +0    -1
  src/ngraph/op/util/reshape.hpp                          +0   -81
  src/ngraph/runtime/allocator.cpp                        +3    -2
  src/ngraph/runtime/allocator.hpp                        +1    -1
  src/ngraph/runtime/cpu/cpu_backend.cpp                  +1    -1
  src/ngraph/runtime/generic_cpu/CMakeLists.txt           +4    -4
  src/ngraph/runtime/generic_cpu/gcpu_backend.cpp         +2    -2
  src/ngraph/runtime/generic_cpu/gcpu_executable.cpp     +47   -30
  src/ngraph/runtime/generic_cpu/gcpu_executable.hpp      +0    -0
  src/ngraph/runtime/generic_cpu/kernel/broadcast.hpp    +95    -0
  src/ngraph/runtime/generic_cpu/kernel/reshape.hpp       +1    -4
  src/ngraph/runtime/generic_cpu/kernel/result.hpp        +0   -41
  src/ngraph/runtime/generic_cpu/node_wrapper.hpp         +1    -1
cmake/external_mkldnn.cmake

@@ -18,10 +18,10 @@ include(ExternalProject)
 # Includes blas 3.8.0 in mkldnn
 set(NGRAPH_MKLDNN_SHORT_VERSION 0)
-set(NGRAPH_MKLDNN_FULL_VERSION 0.19.0.0)
-set(NGRAPH_MKLDNN_VERSION "v0.19")
+set(NGRAPH_MKLDNN_FULL_VERSION 0.20.0.0)
+set(NGRAPH_MKLDNN_VERSION "v0.20")
 set(NGRAPH_MKLDNN_SUB_VERSION "2019.0.5.20190502")
-set(NGRAPH_MKLDNN_GIT_TAG "027de76")
+set(NGRAPH_MKLDNN_GIT_TAG "v0.20")

 #------------------------------------------------------------------------------
 # Fetch and install MKL-DNN
cmake/mkldnn.patch

@@ -28,16 +28,3 @@ index f10feb20..05f47961 100644
          set_property(TARGET ${LIB_NAME} PROPERTY PUBLIC_HEADER ${HEADERS})
          target_include_directories(${LIB_NAME} PUBLIC
-diff --git a/src/cpu/jit_avx512_common_conv_kernel.cpp b/src/cpu/jit_avx512_common_conv_kernel.cpp
-index 1bb98fa43..b8b54401f 100644
---- a/src/cpu/jit_avx512_common_conv_kernel.cpp
-+++ b/src/cpu/jit_avx512_common_conv_kernel.cpp
-@@ -3055,7 +3055,7 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32::bias_kernel_3d() {
- void jit_avx512_common_conv_bwd_weights_kernel_f32
-     ::compute_oh_loop_common()
- {
--    assert(jcp.harness == harness_mb_reduction);
-+    assert(one_of(jcp.harness, harness_mb_reduction, harness_3d_reduction));
-     int b_pad = jcp.b_pad;
-     int t_pad = jcp.t_pad;
-     bool is_dilated = jcp.dilate_h != 0;
doc/sphinx/conf.py

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #

@@ -73,11 +71,11 @@ author = 'Intel Corporation'
 # built documents.
 #
 # The short X.Y version.
-version = '0.22'
+version = '0.23'
 # The Documentation full version, including alpha/beta/rc tags. Some features
 # available in the latest code will not necessarily be documented first
-release = '0.22.0'
+release = '0.23.0'
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
doc/sphinx/ngraph_theme/ngversions.html

@@ -9,11 +9,11 @@
 <dt>{{ _('Recent Versions') }}</dt>
 <dd>
   <!-- Until our https://docs.ngraph.ai/ publishing is set up, we link to GitHub -->
   <ul>
-    <li><a href="https://github.com/NervanaSystems/ngraph/releases/tag/v0.22.0">0.22</a></li>
+    <li><a href="https://github.com/NervanaSystems/ngraph/releases/tag/v0.23.0">0.23.0</a></li>
+    <li><a href="https://github.com/NervanaSystems/ngraph/releases/tag/v0.22.0">0.22.0</a></li>
     <li><a href="https://github.com/NervanaSystems/ngraph/releases/tag/v0.21.0">0.21.0</a></li>
     <li><a href="https://github.com/NervanaSystems/ngraph/releases/tag/v0.20.0">0.20.0</a></li>
     <li><a href="https://github.com/NervanaSystems/ngraph/releases/tag/v0.19.0">0.19.0</a></li>
     <li><a href="https://github.com/NervanaSystems/ngraph/releases/tag/v0.18.1">0.18.1</a></li>
   </ul></dd>
 </dl>
 <dl>
doc/sphinx/source/project/release-notes.rst

@@ -6,28 +6,30 @@ Release Notes

 nGraph is provided as source code, APIs, build scripts, and some binary formats
 for various Compiler stack configurations and use cases.

 For downloads formatted as ``.zip`` and ``tar.gz``, see
 https://github.com/NervanaSystems/ngraph/releases.

 This page includes additional documentation updates.

 We are pleased to announce the release of version |version|-doc.
 ==============================

 Core updates for |version|
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

 + PlaidML support
 + More ONNX ops
 + Optimizations
 + Don't reseed RNG on each use
 + Elementwise divide defaults to Python semantics
 + GenerateMask seed optional

 0.22-doc
 --------

 + Initial doc and API for IntelGPU backend.
 + DynamicBackend API.
 + Note deprecation of support of MXNet's ``ngraph-mxnet`` PyPI.
 + Noted changes on graph inspection options resultant from PR 3016.
 + Added better tips and details to doc-contributor-README.

 Latest doc updates
 ~~~~~~~~~~~~~~~~~~

 + Document new debug tool
 + Note deprecation of MXNet's ``ngraph-mxnet`` PyPI
 + Note default change to `svg` files for graphs and visualization
 + Add more prominent tips for contributors who find the doc-contributor-README

 .. important:: Pre-releases (``-rc-0.*``) have newer features, and are less stable.

@@ -36,8 +38,15 @@ Core updates for |version|

 Changelog on Previous Releases
 ==============================

 For downloads formatted as ``.zip`` and ``tar.gz``, see
 https://github.com/NervanaSystems/ngraph/releases.

 0.22
 ----

 + More ONNX ops
 + Optimizations
 + Don't reseed RNG on each use
 + Initial doc and API for IntelGPU backend
 + DynamicBackend API

 0.21
 ----

@@ -51,12 +60,6 @@ https://github.com/NervanaSystems/ngraph/releases.
 + offset arg for tensor creation is deprecated
 + static linking support
 + Initial test of 0.21-doc

-0.21-doc
---------
-
-Summary of documentation-related changes:
-
-+ Updated :doc:`doc-contributor-README` for new community-based contributions.
-+ Added instructions on how to test or display the installed nGraph version.
-+ Added instructions on building nGraph bridge (ngraph-bridge).

@@ -82,8 +85,6 @@ Summary of documentation-related changes:
 0.19
 ----

-**Download** `0.19.0-rc.2`_

 + More dynamic shape preparation
 + Distributed interface factored out
 + fp16 and bfloat16 types

@@ -103,9 +104,6 @@ Summary of documentation-related changes:
 0.18
 ----

-**Download** `0.18.1`_

 + Python formatting issue
 + mkl-dnn work-around
 + Event tracing improvements

@@ -118,8 +116,6 @@ Summary of documentation-related changes:
 0.17
 ----

-**Download** `0.17.0-rc.1`_

 + Allow negative padding in more places
 + Add code generation for some quantized ops
 + Preliminary dynamic shape support

@@ -131,11 +127,6 @@ Summary of documentation-related changes:
 0.16
 ----

-* **Download**: `0.16.0-rc.3`_
-* **Download** `0.16.0-rc.2`_
-* **Download** `0.16.0-rc.1`_

 + NodeInput and NodeOutput classes prepare for simplifications of Node
 + Test improvements
 + Additional quantization ops

@@ -143,11 +134,3 @@ Summary of documentation-related changes:
 + Fix memory leak
 + Concat optimization
 + Doc updates

-.. _0.20.0-rc.0: https://github.com/NervanaSystems/ngraph/releases/tag/v0.20.0-rc.0_
-.. _0.19.0-rc.2: https://github.com/NervanaSystems/ngraph/releases/tag/v0.19.0-rc.2_
-.. _0.18.1: https://github.com/NervanaSystems/ngraph/releases/tag/v0.18.1_
-.. _0.17.0-rc.1: `https://github.com/NervanaSystems/ngraph/releases/tag/v0.17.0-rc.1
-.. _0.16.0-rc.3: https://github.com/NervanaSystems/ngraph/releases/tag/v0.16.0-rc.3
-.. _0.16.0-rc.2: https://github.com/NervanaSystems/ngraph/releases/tag/v0.16.0-rc.2
-.. _0.16.0-rc.1: https://github.com/NervanaSystems/ngraph/releases/tag/v0.16.0-rc.1
python/test_requirements.txt

 pytest
 tox
 pydocstyle==3.0.0
 flake8
 flake8-commas
 flake8-comprehensions
src/ngraph/CMakeLists.txt

@@ -370,7 +370,6 @@ set (SRC
     op/util/index_reduction.hpp
     op/util/logical_reduction.cpp
     op/util/logical_reduction.hpp
-    op/util/reshape.hpp
     op/util/rnn_cell_base.cpp
     op/util/rnn_cell_base.hpp
     op/util/unary_elementwise_arithmetic.cpp
src/ngraph/op/util/reshape.hpp (deleted, file mode 100644 → 0)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#pragma once

#include <cstddef>
#include <memory>
#include <vector>

#include "ngraph/builder/reshape.hpp"
#include "ngraph/node.hpp"
#include "ngraph/shape.hpp"

namespace ngraph
{
    namespace op
    {
        namespace util
        {
            /// \brief Change shape of input tensor.
            ///
            /// \param[in] node The node producing the tensor to be reshaped.
            /// \param[in] shape The new shape for input tensor.
            ///
            /// \return The node representing a Reshape operation.
            ///
            std::shared_ptr<ngraph::Node> reshape(const std::shared_ptr<ngraph::Node>& node,
                                                  const Shape& shape)
            {
                return builder::reshape(node, shape);
            }

            /// \brief Permute axes according to specified axes_order parameter.
            ///
            /// \param node The node which axes we want to permute.
            /// \param axes_order The permutation of node tensor axes.
            ///
            /// \return: New node with permuted axes.
            std::shared_ptr<ngraph::Node> reorder_axes(const std::shared_ptr<ngraph::Node>& node,
                                                       std::vector<std::size_t> axes_order)
            {
                return builder::reorder_axes(node, axes_order);
            }

            /// \brief Return transposed tensor (with axes in reversed order).
            ///
            /// \param node Input tensor we want to transpose
            ///
            /// \return: New node with reversed dimensions.
            std::shared_ptr<ngraph::Node> transpose(const std::shared_ptr<ngraph::Node>& node)
            {
                return builder::transpose(node);
            }

            /// \brief Flatten the input tensor into a 2D matrix.
            ///
            /// \param node The tensor to be flattened.
            /// \param axis The axis dividing shape.
            ///
            /// \return The new node will be a 2D matrix representing the flattened input node.
            std::shared_ptr<ngraph::Node> flatten(const std::shared_ptr<ngraph::Node>& node,
                                                  int axis)
            {
                return builder::flatten(node, axis);
            }
        } // namespace util
    }     // namespace op
}         // namespace ngraph
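These helpers were thin forwarders to the ``ngraph::builder`` functions pulled in by the includes above, so the header can be dropped and call sites can invoke the builders directly. A minimal sketch of equivalent calls after the deletion; the ``param`` node and the exact includes are illustrative assumptions, not taken from this commit:

    #include <memory>

    #include "ngraph/builder/reshape.hpp"
    #include "ngraph/op/parameter.hpp"

    // Hypothetical example input node.
    auto param = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32,
                                                         ngraph::Shape{2, 3, 4});

    // Each removed op::util wrapper simply forwarded to the matching builder call:
    auto reshaped   = ngraph::builder::reshape(param, ngraph::Shape{6, 4}); // was op::util::reshape
    auto transposed = ngraph::builder::transpose(param);                    // was op::util::transpose
    auto flattened  = ngraph::builder::flatten(param, 1);                   // was op::util::flatten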
src/ngraph/runtime/allocator.cpp

@@ -49,7 +49,8 @@ public:
     }
 };

-std::unique_ptr<ngraph::runtime::Allocator> ngraph::runtime::create_default_allocator()
+ngraph::runtime::Allocator* ngraph::runtime::get_default_allocator()
 {
-    return std::unique_ptr<DefaultAllocator>(new DefaultAllocator());
+    static std::unique_ptr<DefaultAllocator> allocator(new DefaultAllocator());
+    return allocator.get();
 }
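The accessor now hands out one lazily constructed, process-wide allocator instead of giving each caller its own ``unique_ptr``. A minimal stand-alone sketch of the same function-local-static pattern; the type names below are illustrative, not the nGraph classes:

    #include <memory>

    struct Allocator
    {
        virtual ~Allocator() = default;
    };

    struct DefaultAllocator : Allocator
    {
    };

    // Constructed on first call, then reused: every caller observes the same
    // instance, and ownership stays inside this translation unit.
    Allocator* get_default_allocator()
    {
        static std::unique_ptr<DefaultAllocator> allocator(new DefaultAllocator());
        return allocator.get();
    }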
src/ngraph/runtime/allocator.hpp

@@ -30,7 +30,7 @@ namespace ngraph
         class DefaultAllocator;
         /// \brief Create a default allocator that calls into system
         ///        allocation libraries
-        std::unique_ptr<Allocator> create_default_allocator();
+        ngraph::runtime::Allocator* get_default_allocator();
     }
 }
src/ngraph/runtime/cpu/cpu_backend.cpp

@@ -185,7 +185,7 @@ runtime::Allocator* runtime::cpu::CPU_Backend::get_host_memory_allocator()
 {
     if (!m_allocator)
     {
-        m_allocator = create_default_allocator();
+        return runtime::get_default_allocator();
     }
     return m_allocator.get();
 }
src/ngraph/runtime/generic_cpu/CMakeLists.txt

@@ -15,10 +15,10 @@
 # ******************************************************************************

 if (NGRAPH_GENERIC_CPU_ENABLE)
-    find_package(OpenMP)
-    if (OPENMP_FOUND)
-        add_compile_options(${OpenMP_CXX_FLAGS})
-    endif()
+    # find_package(OpenMP)
+    # if (OPENMP_FOUND)
+    #     add_compile_options(${OpenMP_CXX_FLAGS})
+    # endif()
     add_library(gcpu_backend SHARED gcpu_backend.cpp gcpu_executable.cpp node_wrapper.cpp)
     if (NGRAPH_LIB_VERSIONING_ENABLE)
         set_target_properties(gcpu_backend PROPERTIES
src/ngraph/runtime/generic_cpu/gcpu_backend.cpp

@@ -52,14 +52,14 @@ runtime::gcpu::GCPUBackend::GCPUBackend(const vector<string>& unsupported_op_nam
 shared_ptr<runtime::Tensor> runtime::gcpu::GCPUBackend::create_tensor(const element::Type& type,
                                                                       const Shape& shape)
 {
-    return make_shared<runtime::HostTensor>(type, shape, this);
+    return make_shared<runtime::HostTensor>(type, shape);
 }

 shared_ptr<runtime::Tensor> runtime::gcpu::GCPUBackend::create_tensor(
     const element::Type& type, const Shape& shape, void* memory_pointer)
 {
-    return make_shared<runtime::HostTensor>(type, shape, memory_pointer, this);
+    return make_shared<runtime::HostTensor>(type, shape, memory_pointer);
 }

 shared_ptr<runtime::Executable>
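Both overloads now construct the ``HostTensor`` without passing the backend pointer. A hedged usage sketch of the two overloads; the backend name "GCPU" and the header path are assumptions, not confirmed by this diff:

    #include "ngraph/runtime/backend.hpp"

    // First overload: the backend allocates the storage.
    auto backend = ngraph::runtime::Backend::create("GCPU"); // backend name is an assumption
    auto owned = backend->create_tensor(ngraph::element::f32, ngraph::Shape{2, 3});

    // Second overload: wrap caller-owned memory instead of allocating.
    float buffer[6] = {0};
    auto wrapped = backend->create_tensor(ngraph::element::f32, ngraph::Shape{2, 3}, buffer);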
src/ngraph/runtime/generic_cpu/gcpu_executable.cpp

@@ -15,17 +15,22 @@
 //*****************************************************************************

 #include "ngraph/runtime/generic_cpu/gcpu_executable.hpp"
 #include "ngraph/cpio.hpp"
 #include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
 #include "ngraph/except.hpp"
 #include "ngraph/op/convert.hpp"
 #include "ngraph/op/select.hpp"
 #include "ngraph/op/util/binary_elementwise_comparison.hpp"
 #include "ngraph/pass/assign_layout.hpp"
 #include "ngraph/pass/core_fusion.hpp"
 #include "ngraph/pass/fused_op_decomposition.hpp"
 #include "ngraph/pass/implicit_broadcast_elimination.hpp"
 #include "ngraph/pass/like_replacement.hpp"
 #include "ngraph/pass/liveness.hpp"
 #include "ngraph/pass/manager.hpp"
 #include "ngraph/pass/memory_layout.hpp"
 #include "ngraph/runtime/backend_manager.hpp"
 #include "ngraph/serializer.hpp"
 #include "ngraph/util.hpp"

 using namespace std;

@@ -35,21 +40,35 @@ using descriptor::layout::DenseTensorLayout;
 runtime::gcpu::GCPUExecutable::GCPUExecutable(const shared_ptr<Function>& function,
                                               bool enable_performance_collection)
+    : m_is_compiled{true}
+    , m_performance_counters_enabled{enable_performance_collection}
 {
-    m_is_compiled = true;
     m_function = clone_function(*function);
     pass::Manager pass_manager;
     pass_manager.register_pass<pass::LikeReplacement>();
     pass_manager.register_pass<pass::FusedOpDecomposition>();
     pass_manager.register_pass<pass::ImplicitBroadcastElimination>();
     pass_manager.register_pass<pass::AssignLayout<DenseTensorLayout>>();
     pass_manager.register_pass<pass::Liveness>();
-    pass_manager.run_passes(function);
+    pass_manager.run_passes(m_function);

-    for (const shared_ptr<Node>& node : function->get_ordered_ops())
+    for (const shared_ptr<Node>& node : m_function->get_ordered_ops())
     {
         m_wrapped_nodes.emplace_back(node);
     }
-    set_parameters_and_results(*function);
+    set_parameters_and_results(*m_function);
 }

+runtime::gcpu::GCPUExecutable::GCPUExecutable(const std::string& model_string)
+    : m_is_compiled{true}
+    , m_performance_counters_enabled{false}
+{
+    m_function = deserialize(model_string);
+    for (const shared_ptr<Node>& node : m_function->get_ordered_ops())
+    {
+        m_wrapped_nodes.emplace_back(node);
+    }
+    set_parameters_and_results(*m_function);
+}

 bool runtime::gcpu::GCPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,

@@ -82,7 +101,7 @@ bool runtime::gcpu::GCPUExecutable::call(const vector<shared_ptr<runtime::Tensor
     {
         for (size_t i = 0; i < param->get_output_size(); ++i)
         {
-            descriptor::Tensor* tensor = param->get_output_tensor_ptr(i).get();
+            descriptor::Tensor* tensor = &param->output(i).get_tensor();
             tensor_map.insert({tensor, func_inputs[input_count++]});
         }
     }

@@ -95,14 +114,14 @@ bool runtime::gcpu::GCPUExecutable::call(const vector<shared_ptr<runtime::Tensor
         {
             throw ngraph_error("One of function's outputs isn't op::Result");
         }
-        descriptor::Tensor* tensor = output->get_output_tensor_ptr(0).get();
+        descriptor::Tensor* tensor = &output->output(0).get_tensor();
         tensor_map.insert({tensor, func_outputs[output_count]});
     }

     // for each ordered op in the graph
     for (const NodeWrapper& wrapped : m_wrapped_nodes)
     {
-        const Node* op = &wrapped.get_node();
+        auto op = wrapped.get_node();
         auto type_id = wrapped.get_typeid();
         if (type_id == OP_TYPEID::Parameter)
         {

@@ -111,9 +130,9 @@ bool runtime::gcpu::GCPUExecutable::call(const vector<shared_ptr<runtime::Tensor
         // get op inputs from map
         vector<shared_ptr<HostTensor>> op_inputs;
-        for (const descriptor::Input& input : op->get_inputs())
+        for (auto input : op->inputs())
         {
-            descriptor::Tensor* tensor = input.get_output().get_tensor_ptr().get();
+            descriptor::Tensor* tensor = &input.get_tensor();
             op_inputs.push_back(tensor_map.at(tensor));
         }

@@ -121,14 +140,14 @@ bool runtime::gcpu::GCPUExecutable::call(const vector<shared_ptr<runtime::Tensor
         vector<shared_ptr<HostTensor>> op_outputs;
         for (size_t i = 0; i < op->get_output_size(); ++i)
         {
-            descriptor::Tensor* tensor = op->get_output_tensor_ptr(i).get();
+            descriptor::Tensor* tensor = &op->output(i).get_tensor();
             shared_ptr<HostTensor> host_tensor;
             auto it = tensor_map.find(tensor);
             if (it == tensor_map.end())
             {
                 const Shape& shape = op->get_output_shape(i);
                 const element::Type& type = op->get_output_element_type(i);
-                string name = op->get_output_tensor(i).get_name();
+                string name = op->output(i).get_tensor().get_name();
                 host_tensor = make_shared<runtime::HostTensor>(type, shape, name);
                 tensor_map.insert({tensor, host_tensor});
             }

@@ -177,7 +196,7 @@ bool runtime::gcpu::GCPUExecutable::call(const vector<shared_ptr<runtime::Tensor
         }
         if (m_nan_check_enabled)
         {
-            perform_nan_check(op_outputs, op);
+            perform_nan_check(op_outputs, op.get());
         }
     }

@@ -186,19 +205,9 @@ bool runtime::gcpu::GCPUExecutable::call(const vector<shared_ptr<runtime::Tensor
 void runtime::gcpu::GCPUExecutable::generate_calls(const element::Type& type,
                                                    const NodeWrapper& op,
-                                                   const vector<shared_ptr<HostTensor>>& outputs,
-                                                   const vector<shared_ptr<HostTensor>>& inputs)
+                                                   const vector<shared_ptr<HostTensor>>& out,
+                                                   const vector<shared_ptr<HostTensor>>& in)
 {
-    vector<void*> out;
-    vector<const void*> in;
-    for (auto t : outputs)
-    {
-        out.push_back(t->get_data_ptr());
-    }
-    for (auto t : inputs)
-    {
-        in.push_back(t->get_data_ptr());
-    }
     stringstream ss;
     switch (type.get_type_enum())
     {

@@ -216,7 +225,8 @@ void runtime::gcpu::GCPUExecutable::generate_calls(const element::Type& type,
     case element::Type_t::undefined:
     case element::Type_t::dynamic:
     case element::Type_t::bf16:
-        ss << "unsupported element type " << type << " op " << op.get_node().get_name();
+    case element::Type_t::f16:
+        ss << "unsupported element type " << type << " op " << op.get_node()->get_name();
         throw ngraph_error(ss.str());
     }
 }

@@ -229,11 +239,9 @@ void runtime::gcpu::GCPUExecutable::set_nan_check(bool enable)
 vector<runtime::PerformanceCounter> runtime::gcpu::GCPUExecutable::get_performance_data() const
 {
     vector<runtime::PerformanceCounter> rc;
-    for (const pair<const Node*, stopwatch> p : m_timer_map)
+    for (const pair<shared_ptr<const Node>, stopwatch> p : m_timer_map)
     {
-        rc.emplace_back(p.first->get_name().c_str(),
-                        p.second.get_total_microseconds(),
-                        p.second.get_call_count());
+        rc.emplace_back(p.first, p.second.get_total_microseconds(), p.second.get_call_count());
     }
     return rc;
 }

@@ -286,3 +294,12 @@ void runtime::gcpu::GCPUExecutable::perform_nan_check(const vector<shared_ptr<Ho
         arg_number++;
     }
 }
+
+void runtime::gcpu::GCPUExecutable::save(ostream& out)
+{
+    cpio::Writer writer(out);
+    string si = "INTERPRETER Save File 1.0";
+    writer.write("save_info", si.data(), si.size());
+    string model = serialize(m_function, 0);
+    writer.write("model", model.data(), model.size());
+}
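Several hunks above swap the older descriptor accessors for the Output/Input API. A small sketch of the equivalence the diff relies on, using only the two calls that appear in the hunks; ``node`` stands for any node shared pointer:

    #include <cassert>

    #include "ngraph/node.hpp"

    // Sketch: both expressions reach the same descriptor::Tensor; the diff moves
    // the executable from the first form to the second.
    void check_equivalent_accessors(const std::shared_ptr<ngraph::Node>& node)
    {
        ngraph::descriptor::Tensor* via_old = node->get_output_tensor_ptr(0).get();
        ngraph::descriptor::Tensor* via_new = &node->output(0).get_tensor();
        assert(via_old == via_new);
    }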
src/ngraph/runtime/generic_cpu/gcpu_executable.hpp (+0 -0; diff not shown)
src/ngraph/runtime/generic_cpu/kernel/broadcast.hpp

@@ -140,6 +140,91 @@ namespace ngraph
                     }
                 }

+                template <typename T>
+                void broadcast_5d(const T* in,
+                                  T* out,
+                                  const Shape& in_shape,
+                                  const Shape& out_shape,
+                                  const AxisSet& broadcast_axes)
+                {
+                    size_t index[5];
+                    size_t* out_index = 0;
+                    for (size_t i = 0; i < 5; i++)
+                    {
+                        if (broadcast_axes.count(i) == 0)
+                        {
+                            out_index = &index[i];
+                            break;
+                        }
+                    }
+                    for (index[0] = 0; index[0] < out_shape[0]; ++index[0])
+                    {
+                        for (index[1] = 0; index[1] < out_shape[1]; ++index[1])
+                        {
+                            for (index[2] = 0; index[2] < out_shape[2]; ++index[2])
+                            {
+                                for (index[3] = 0; index[3] < out_shape[3]; ++index[3])
+                                {
+                                    for (index[4] = 0; index[4] < out_shape[4]; ++index[4])
+                                    {
+                                        out[index[0] * out_shape[1] * out_shape[2] * out_shape[3] * out_shape[4] +
+                                            index[1] * out_shape[2] * out_shape[3] * out_shape[4] +
+                                            index[2] * out_shape[3] * out_shape[4] +
+                                            index[3] * out_shape[4] +
+                                            index[4]] = in[*out_index];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+
+                template <typename T>
+                void broadcast_6d(const T* in,
+                                  T* out,
+                                  const Shape& in_shape,
+                                  const Shape& out_shape,
+                                  const AxisSet& broadcast_axes)
+                {
+                    size_t index[6];
+                    size_t* out_index = 0;
+                    for (size_t i = 0; i < 6; i++)
+                    {
+                        if (broadcast_axes.count(i) == 0)
+                        {
+                            out_index = &index[i];
+                            break;
+                        }
+                    }
+                    for (index[0] = 0; index[0] < out_shape[0]; ++index[0])
+                    {
+                        for (index[1] = 0; index[1] < out_shape[1]; ++index[1])
+                        {
+                            for (index[2] = 0; index[2] < out_shape[2]; ++index[2])
+                            {
+                                for (index[3] = 0; index[3] < out_shape[3]; ++index[3])
+                                {
+                                    for (index[4] = 0; index[4] < out_shape[4]; ++index[4])
+                                    {
+                                        for (index[5] = 0; index[5] < out_shape[5]; ++index[5])
+                                        {
+                                            out[index[0] * out_shape[1] * out_shape[2] * out_shape[3] * out_shape[4] * out_shape[5] +
+                                                index[1] * out_shape[2] * out_shape[3] * out_shape[4] * out_shape[5] +
+                                                index[2] * out_shape[3] * out_shape[4] * out_shape[5] +
+                                                index[3] * out_shape[4] * out_shape[5] +
+                                                index[4] * out_shape[5] +
+                                                index[5]] = in[*out_index];
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+
                 template <typename T>
                 void broadcast(const T* in,
                                T* out,

@@ -167,6 +252,16 @@ namespace ngraph
                         case 4:
                             broadcast_4d<T>(in, out, in_shape, out_shape, broadcast_axes);
                             break;
+                        case 5:
+                            broadcast_5d<T>(in, out, in_shape, out_shape, broadcast_axes);
+                            break;
+                        case 6:
+                            broadcast_6d<T>(in, out, in_shape, out_shape, broadcast_axes);
+                            break;
                         default:
                             runtime::reference::broadcast<T>(
                                 in, out, in_shape, out_shape, broadcast_axes);
                             break;
                         }
                     }
                     else
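The hand-expanded products in broadcast_5d and broadcast_6d are a row-major (C-order) flattening of the loop indices. A small illustrative helper (not part of the kernel) showing the same offset computed incrementally:

    #include <cstddef>
    #include <vector>

    // Row-major offset of `index` within `shape`; for 5 dimensions this expands to
    // i0*s1*s2*s3*s4 + i1*s2*s3*s4 + i2*s3*s4 + i3*s4 + i4, matching the kernels above.
    size_t row_major_offset(const std::vector<size_t>& index, const std::vector<size_t>& shape)
    {
        size_t offset = 0;
        for (size_t d = 0; d < shape.size(); ++d)
        {
            offset = offset * shape[d] + index[d];
        }
        return offset;
    }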
src/ngraph/runtime/generic_cpu/kernel/reshape.hpp

@@ -244,10 +244,7 @@ namespace ngraph
                 case 4:
                     reshape_in4<T>(in, out, in_shape, in_axis_order, out_shape);
                     break;
                 case 5:
                     reshape_in5<T>(in, out, in_shape, in_axis_order, out_shape);
                     break;
                 case 6:
                     reshape_in6<T>(in, out, in_shape, in_axis_order, out_shape);
                     break;
-                default:
-                    NGRAPH_INFO << "reference::reshape";
-                    reference::reshape(in, out, in_shape, in_axis_order, out_shape);
-                    break;
+                default:
+                    reference::reshape(in, out, in_shape, in_axis_order, out_shape);
+                    break;
                 }
             }
src/ngraph/runtime/generic_cpu/kernel/result.hpp (deleted, file mode 100644 → 0)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#pragma once

#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

#include "ngraph/shape.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace gcpu
        {
            namespace kernel
            {
                template <typename T>
                void result(const T* arg, T* out, size_t count)
                {
                    memcpy(out, arg, sizeof(T) * count);
                }
            }
        }
    }
}
src/ngraph/runtime/generic_cpu/node_wrapper.hpp

@@ -51,7 +51,7 @@ class ngraph::runtime::gcpu::NodeWrapper
 public:
     NodeWrapper(const std::shared_ptr<const ngraph::Node>& node);

-    const Node& get_node() const { return *m_node; }
+    std::shared_ptr<const Node> get_node() const { return m_node; }
     ngraph::runtime::gcpu::OP_TYPEID get_typeid() const { return m_typeid; }
 private:
     std::shared_ptr<const ngraph::Node> m_node;
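Because get_node() now returns a shared_ptr<const Node> rather than a reference, call sites dereference it with -> or take .get() when a raw pointer is still needed, as the gcpu_executable.cpp hunks above do. A short sketch; ``wrapped`` is a hypothetical NodeWrapper instance:

    // Member access now goes through the smart pointer.
    std::shared_ptr<const ngraph::Node> node = wrapped.get_node();
    std::string name = node->get_name();

    // Where an API still expects a raw pointer (e.g. perform_nan_check), use .get().
    const ngraph::Node* raw = node.get();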