Merge branch 'master' into cyphers/dochow

cbdaab7e · Scott Cyphers · a982c793 · 57baec21 · cbdaab7e · cbdaab7e
Commit cbdaab7e authored Mar 02, 2018 by Scott Cyphers
39 changed files
--- a/cmake/external_llvm_prebuilt.cmake
+++ b/cmake/external_llvm_prebuilt.cmake
@@ -16,8 +16,11 @@
 include(ExternalProject)
-if((NGRAPH_CPU_ENABLE OR NGRAPH_GPU_ENABLE) AND (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") AND
+# Reject the prebuilt-LLVM path on Darwin and Windows, but only when the CPU
+# backend is enabled. The two platform tests are grouped explicitly: if()
+# evaluates AND/OR left to right, so without the extra parentheses the
+# Windows test would trigger the error even with NGRAPH_CPU_ENABLE off.
+if (NGRAPH_CPU_ENABLE AND ((${CMAKE_SYSTEM_NAME} MATCHES "Darwin") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows")))
-                         (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
+    message(FATAL_ERROR "The NGRAPH_USE_PREBUILT_LLVM option is not supported on this platform.")
+endif()
+if (NGRAPH_CPU_ENABLE)
    message(STATUS "Fetching LLVM from llvm.org")
    # Override default LLVM binaries

--- a/doc/sphinx/source/ops/allreduce.rst
+++ b/doc/sphinx/source/ops/allreduce.rst
+.. allreduce.rst:
+#########
+AllReduce
+#########
+.. code-block:: cpp
+   AllReduce // Collective operation
+Description
+===========
+Combines values from all processes or devices and distributes the result back
+to all processes or devices.
+Inputs
+------
++-----------------+-------------------------+--------------------------------+
+| Name            | Element Type            | Shape                          |
++=================+=========================+================================+
+| ``arg``         | ``element::f32``        | Any                            |
+|                 | ``element::f64``        |                                |
++-----------------+-------------------------+--------------------------------+
+Outputs
+-------
++-----------------+-------------------------+--------------------------------+
+| Name            | Element Type            | Shape                          |
++=================+=========================+================================+
+| ``output``      | ``element::f32``        | Same as ``arg``                |
+|                 | ``element::f64``        |                                |
++-----------------+-------------------------+--------------------------------+
+C++ Interface
+=============
+.. doxygenclass:: ngraph::op::AllReduce
+   :project: ngraph
+   :members:
--- a/doc/sphinx/source/ops/index.rst
+++ b/doc/sphinx/source/ops/index.rst
@@ -52,6 +52,7 @@ Not currently a comprehensive list.
   abs.rst
   acos.rst
   add.rst
+   allreduce.rst
   asin.rst
   atan.rst
   avg_pool.rst

--- a/src/ngraph/CMakeLists.txt
+++ b/src/ngraph/CMakeLists.txt
@@ -34,6 +34,7 @@ set (SRC
    node.cpp
    ops/abs.cpp
    ops/add.cpp
+    ops/allreduce.cpp
    ops/avg_pool.cpp
    ops/batch_norm.cpp
    ops/broadcast.cpp
@@ -215,11 +216,6 @@ if(NGRAPH_DISTRIBUTED_ENABLE AND MPI_CXX_INCLUDE_PATH)
    include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
    link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
-    # Add sources for distributed ngraph
-    # and all its dependencies
-    set(SRC ${SRC}
-        ops/allreduce.cpp
-    )
    set_property(SOURCE codegen/compiler.cpp APPEND PROPERTY COMPILE_DEFINITIONS
        "MPI_HEADER_PATH=\"${MPI_C_INCLUDE_PATH}\";")
 endif()
@@ -353,11 +349,11 @@ install(DIRECTORY
 if (NGRAPH_TBB_ENABLE)
+    # Install the TBB shared libraries found under tbb_build/ into
+    # NGRAPH_INSTALL_LIB. Only files whose path ends in "/libtbb<suffix>" (or
+    # "/libtbb_debug<suffix>"), optionally followed by dotted numeric version
+    # components, are copied. <suffix> is CMAKE_SHARED_LIBRARY_SUFFIX and is
+    # inserted into the regex unescaped, so its "." matches any character.
    install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tbb_build/tbb_release/
        DESTINATION ${NGRAPH_INSTALL_LIB}
-        FILES_MATCHING PATTERN "libtbb.so.*"
+        FILES_MATCHING REGEX "/libtbb${CMAKE_SHARED_LIBRARY_SUFFIX}(\\.[0-9]+)*$"
    )
    install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tbb_build/tbb_debug/
        DESTINATION ${NGRAPH_INSTALL_LIB}
-        FILES_MATCHING PATTERN "libtbb_debug.so.*"
+        FILES_MATCHING REGEX "/libtbb_debug${CMAKE_SHARED_LIBRARY_SUFFIX}(\\.[0-9]+)*$"
    )
 endif()

--- a/src/ngraph/axis_set.hpp
+++ b/src/ngraph/axis_set.hpp
@@ -21,7 +21,6 @@
 namespace ngraph
 {
-#ifdef NO_GLOBAL_TYPE_ALIASES
    /// \brief A set of axes.
    class AxisSet : public std::set<size_t>
    {
@@ -54,8 +53,4 @@ namespace ngraph
            return *this;
        }
    };
-#else
-    // Deprecated definition
-    using AxisSet = std::set<size_t>;
-#endif
 }
--- a/src/ngraph/axis_vector.hpp
+++ b/src/ngraph/axis_vector.hpp
@@ -21,7 +21,6 @@
 namespace ngraph
 {
-#ifdef NO_GLOBAL_TYPE_ALIASES
    /// \brief A vector of axes.
    class AxisVector : public std::vector<size_t>
    {
@@ -64,8 +63,4 @@ namespace ngraph
            return *this;
        }
    };
-#else
-    // Deprecated definition
-    using AxisVector = std::vector<size_t>;
-#endif
 }
--- a/src/ngraph/coordinate.hpp
+++ b/src/ngraph/coordinate.hpp
@@ -24,7 +24,6 @@
 namespace ngraph
 {
-#ifdef NO_GLOBAL_TYPE_ALIASES
    /// \brief Coordinates for a tensor element
    class Coordinate : public std::vector<size_t>
    {
@@ -73,10 +72,6 @@ namespace ngraph
            return *this;
        }
    };
-#else
-    // Deprecated definition
-    using Coordinate = std::vector<size_t>;
-#endif
    // Removes some values from a vector of axis values
    template <typename AXIS_VALUES>

--- a/src/ngraph/coordinate_diff.hpp
+++ b/src/ngraph/coordinate_diff.hpp
@@ -21,7 +21,6 @@
 namespace ngraph
 {
-#ifdef NO_GLOBAL_TYPE_ALIASES
    /// \brief A difference (signed) of tensor element coordinates.
    class CoordinateDiff : public std::vector<std::ptrdiff_t>
    {
@@ -64,8 +63,4 @@ namespace ngraph
            return *this;
        }
    };
-#else
-    // Deprecated definition
-    using CoordinateDiff = std::vector<std::ptrdiff_t>;
-#endif
 }
--- a/src/ngraph/ngraph.hpp
+++ b/src/ngraph/ngraph.hpp
@@ -67,6 +67,7 @@
 #include "ngraph/ops/abs.hpp"
 #include "ngraph/ops/acos.hpp"
 #include "ngraph/ops/add.hpp"
+#include "ngraph/ops/allreduce.hpp"
 #include "ngraph/ops/asin.hpp"
 #include "ngraph/ops/atan.hpp"
 #include "ngraph/ops/avg_pool.hpp"
@@ -130,7 +131,3 @@
 #include "ngraph/shape.hpp"
 #include "ngraph/types/element_type.hpp"
 #include "ngraph/types/type.hpp"
-#ifdef NGRAPH_DISTRIBUTED
-#include "ngraph/ops/allreduce.hpp"
-#endif
--- a/src/ngraph/node_vector.hpp
+++ b/src/ngraph/node_vector.hpp
@@ -23,8 +23,6 @@ namespace ngraph
 {
    class Node;
-#ifdef NO_GLOBAL_TYPE_ALIASES
    /// \brief Zero or more nodes.
    class NodeVector : public std::vector<std::shared_ptr<Node>>
    {
@@ -46,10 +44,4 @@ namespace ngraph
        NodeVector() {}
    };
-#else
-    // Deprecated definitions
-    using NodeVector = std::vector<std::shared_ptr<Node>>;
-    using Nodes = NodeVector;
-#endif
 }
--- a/src/ngraph/ops/allreduce.cpp
+++ b/src/ngraph/ops/allreduce.cpp
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // ----------------------------------------------------------------------------
-#ifdef NGRAPH_DISTRIBUTED
 #include "ngraph/ops/allreduce.hpp"
 using namespace std;
@@ -31,5 +29,3 @@ op::AllReduce::AllReduce(const std::shared_ptr<Node>& arg)
        throw ngraph_error("Unsupported data type for AllReduce");
    }
 }
-#endif
--- a/src/ngraph/ops/allreduce.hpp
+++ b/src/ngraph/ops/allreduce.hpp
@@ -14,8 +14,6 @@
 #pragma once
-#ifdef NGRAPH_DISTRIBUTED
 #include <memory>
 #include "ngraph/ops/util/requires_tensor_view_args.hpp"
@@ -40,5 +38,3 @@ namespace ngraph
        };
    }
 }
-#endif
--- a/src/ngraph/ops/parameter_vector.hpp
+++ b/src/ngraph/ops/parameter_vector.hpp
@@ -24,8 +24,6 @@ namespace ngraph
    {
        class Parameter;
-#ifdef NO_GLOBAL_TYPE_ALIASES
        /// \brief Zero or more nodes.
        class ParameterVector : public std::vector<std::shared_ptr<op::Parameter>>
        {
@@ -47,11 +45,5 @@ namespace ngraph
            ParameterVector() {}
        };
-#else
-        // Deprecated definitions
-        using ParameterVector = std::vector<std::shared_ptr<op::Parameter>>;
-        using Parameters = ParameterVector;
-#endif
    }
 }
--- a/src/ngraph/runtime/cpu/cpu_call_frame.cpp
+++ b/src/ngraph/runtime/cpu/cpu_call_frame.cpp
@@ -65,7 +65,9 @@ void runtime::cpu::CPU_CallFrame::tensor_call(
    if (runtime::cpu::IsTracingEnabled())
    {
-        GenerateTimeline(m_external_function->get_op_attrs(), ctx->op_durations);
+        GenerateTimeline(m_external_function->get_op_attrs(),
+                         ctx->op_durations,
+                         m_external_function->get_function_name() + ".timeline.json");
    }
 }

--- a/src/ngraph/runtime/cpu/cpu_emitter.cpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.cpp
--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -245,6 +245,11 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
    , m_compiled_function(nullptr)
    , m_emit_timing(std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr)
    , m_use_tbb(std::getenv("NGRAPH_CPU_USE_TBB") != nullptr)
+    , m_function_name(function->get_name())
+{
+}
+runtime::cpu::CPU_ExternalFunction::~CPU_ExternalFunction()
 {
 }
@@ -255,16 +260,14 @@ void runtime::cpu::CPU_ExternalFunction::compile()
        return;
    }
-    string function_name = m_function->get_name();
+    m_mkldnn_emitter.reset(new MKLDNNEmitter());
-    m_mkldnn_emitter.reset(new MKLDNNEmitter(shared_from_this()));
    ngraph::pass::Manager pass_manager;
    pass_manager.register_pass<ngraph::pass::CoreFusion>();
    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
-    pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(shared_from_this());
+    pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
-    pass_manager.register_pass<runtime::cpu::pass::CPULayout>(shared_from_this());
+    pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
    pass_manager.register_pass<ngraph::pass::Liveness>();
    pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
@@ -537,7 +540,7 @@ using namespace ngraph::runtime;
        }
        // Execution tracing support
-        if (runtime::cpu::IsTracingEnabled() && current_function->get_name() == function_name)
+        if (runtime::cpu::IsTracingEnabled() && current_function->get_name() == m_function_name)
        {
            writer << "cpu::Timestamp start_ts;\n"
                   << "int profiler_count = 0;\n\n";
@@ -688,7 +691,7 @@ using namespace ngraph::runtime;
            // Emit operation prologue
            if (!node->is_parameter() && !node->is_constant())
            {
-                if (current_function->get_name() == function_name)
+                if (current_function->get_name() == m_function_name)
                {
                    m_op_attrs.emplace_back(
                        node->description(), node_output_names, node_input_names);
@@ -706,7 +709,7 @@ using namespace ngraph::runtime;
                    emit_debug_function_entry(writer, node.get(), in, out);
                }
                if (runtime::cpu::IsTracingEnabled() &&
-                    current_function->get_name() == function_name)
+                    current_function->get_name() == m_function_name)
                {
                    writer << "start_ts = cpu::Clock::now();\n";
                }
@@ -750,7 +753,7 @@ using namespace ngraph::runtime;
                    emit_debug_function_exit(writer, node.get(), in, out);
                }
                if (runtime::cpu::IsTracingEnabled() &&
-                    current_function->get_name() == function_name)
+                    current_function->get_name() == m_function_name)
                {
                    writer << "ctx->op_durations[profiler_count++] = "
                           << "(std::chrono::duration_cast<cpu::Timescale>(cpu::Clock::now() - "
@@ -848,7 +851,7 @@ using namespace ngraph::runtime;
    // TODO: Cleanup and make this a utility function
    file_util::make_directory(s_output_dir);
-    string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
+    string filename = file_util::path_join(s_output_dir, m_function_name + "_codegen.cpp");
    ofstream out(filename);
    string code = writer.get_code();
    out << code;
@@ -867,7 +870,7 @@ using namespace ngraph::runtime;
    }
    m_execution_engine->add_module(codegen_module);
    m_execution_engine->finalize();
-    m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
+    m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(m_function_name);
    if (m_compiled_function == nullptr)
    {

--- a/src/ngraph/runtime/cpu/cpu_external_function.hpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.hpp
@@ -75,6 +75,7 @@ namespace ngraph
            public:
                CPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
                                     bool release_function = true);
+                ~CPU_ExternalFunction();
                std::shared_ptr<ngraph::runtime::CallFrame> make_call_frame();
                const LayoutDescriptorPtrs& get_parameter_layout_descriptors();
@@ -86,6 +87,7 @@ namespace ngraph
                    return m_mkldnn_emitter;
                }
+                const std::string& get_function_name() const { return m_function_name; }
            protected:
                void compile();
@@ -123,6 +125,8 @@ namespace ngraph
                std::vector<OpAttributes> m_op_attrs;
                std::unique_ptr<MKLDNNEmitter> m_mkldnn_emitter;
+                std::string m_function_name;
            };
        }
    }

--- a/src/ngraph/runtime/cpu/cpu_manager.cpp
+++ b/src/ngraph/runtime/cpu/cpu_manager.cpp
@@ -30,7 +30,8 @@ std::shared_ptr<ngraph::runtime::Backend> runtime::cpu::CPU_Manager::allocate_ba
 std::shared_ptr<ngraph::runtime::ExternalFunction>
    runtime::cpu::CPU_Manager::compile(const std::shared_ptr<ngraph::Function>& fun)
 {
-    return std::make_shared<CPU_ExternalFunction>(fun);
+    // Wrap `fun` in a new CPU_ExternalFunction; the shared_ptr is converted to
+    // the ExternalFunction return type on return.
+    auto rc = std::make_shared<CPU_ExternalFunction>(fun);
+    return rc;
 }
 ngraph::runtime::Manager::Factory runtime::cpu::CPU_Manager::factory =

--- a/src/ngraph/runtime/cpu/cpu_tracing.cpp
+++ b/src/ngraph/runtime/cpu/cpu_tracing.cpp
@@ -42,11 +42,12 @@ void ngraph::runtime::cpu::to_json(nlohmann::json& json, const TraceEvent& event
 }
 void ngraph::runtime::cpu::GenerateTimeline(const std::vector<OpAttributes>& op_attrs,
-                                            int64_t* op_durations)
+                                            int64_t* op_durations,
+                                            const std::string& file_name)
 {
    nlohmann::json timeline;
    std::list<TraceEvent> trace;
-    std::ofstream out("timeline.json");
+    std::ofstream out(file_name);
    int64_t ts = 0;
    for (size_t i = 0; i < op_attrs.size(); i++)

--- a/src/ngraph/runtime/cpu/cpu_tracing.hpp
+++ b/src/ngraph/runtime/cpu/cpu_tracing.hpp
@@ -69,7 +69,9 @@ namespace ngraph
            void to_json(nlohmann::json& json, const TraceEvent& event);
-            void GenerateTimeline(const std::vector<OpAttributes>& op_attrs, int64_t* op_durations);
+            void GenerateTimeline(const std::vector<OpAttributes>& op_attrs,
+                                  int64_t* op_durations,
+                                  const std::string& file_name);
            bool IsTracingEnabled();
        }
    }

--- a/src/ngraph/runtime/cpu/mkldnn_emitter.cpp
+++ b/src/ngraph/runtime/cpu/mkldnn_emitter.cpp
--- a/src/ngraph/runtime/cpu/mkldnn_emitter.hpp
+++ b/src/ngraph/runtime/cpu/mkldnn_emitter.hpp
@@ -23,6 +23,7 @@
 #include <mkldnn.hpp>
 #include "ngraph/coordinate_diff.hpp"
+#include "ngraph/shape.hpp"
 #include "ngraph/strides.hpp"
 namespace ngraph
@@ -37,11 +38,7 @@ namespace ngraph
            class MKLDNNEmitter
            {
            public:
-                MKLDNNEmitter(std::shared_ptr<CPU_ExternalFunction> ef)
+                MKLDNNEmitter() {}
-                    : external_function(ef)
-                {
-                }
                const std::vector<mkldnn::primitive*>& get_mkldnn_primitives() const;
                size_t insert_primitive(mkldnn::primitive* primitive);
@@ -69,6 +66,37 @@ namespace ngraph
                                                 const ngraph::CoordinateDiff& padding_below,
                                                 const ngraph::CoordinateDiff& padding_above);
+                size_t
+                    build_convolution_backward_weights(const mkldnn::memory::desc& input_desc,
+                                                       const mkldnn::memory::desc& delta_desc,
+                                                       const mkldnn::memory::desc& result_desc,
+                                                       const ngraph::Strides& strides,
+                                                       const ngraph::Strides& dilation_strides,
+                                                       const ngraph::CoordinateDiff& padding_below,
+                                                       const ngraph::CoordinateDiff& padding_above);
+                size_t build_convolution_backward_data(const mkldnn::memory::desc& weights_desc,
+                                                       const mkldnn::memory::desc& delta_desc,
+                                                       const mkldnn::memory::desc& result_desc,
+                                                       const ngraph::Strides& strides,
+                                                       const ngraph::Strides& dilation_strides,
+                                                       const ngraph::CoordinateDiff& padding_below,
+                                                       const ngraph::CoordinateDiff& padding_above);
+                size_t build_pooling_forward(mkldnn::algorithm pooling_algorithm,
+                                             const mkldnn::memory::desc& input_desc,
+                                             const mkldnn::memory::desc& result_desc,
+                                             const ngraph::Strides& window_strides,
+                                             const ngraph::Shape& window_shape,
+                                             const ngraph::Shape& padding_below,
+                                             const ngraph::Shape& padding_above);
+                size_t build_reorder(const mkldnn::memory::desc& input_desc,
+                                     const mkldnn::memory::desc& result_desc);
+                size_t build_relu_forward(const mkldnn::memory::desc& input_desc,
+                                          const mkldnn::memory::desc& result_desc);
                size_t build_elementwise_add(
                    const mkldnn::memory::desc& input0_data_desc,
                    const mkldnn::memory::desc& input1_data_desc,
@@ -77,10 +105,9 @@ namespace ngraph
                    const std::vector<mkldnn::memory::primitive_desc>& input_pd);
            private:
-                std::shared_ptr<CPU_ExternalFunction> external_function;
+                std::vector<mkldnn::primitive*> m_mkldnn_primitives;
-                std::vector<mkldnn::primitive*> mkldnn_primitives;
+                std::vector<mkldnn::stream> m_mkldnn_streams;
-                std::vector<mkldnn::stream> mkldnn_streams;
+                std::unordered_map<size_t, std::vector<size_t>> m_primitive_deps;
-                std::unordered_map<size_t, std::vector<size_t>> primitive_deps;
            };
        }
    }

--- a/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
@@ -256,7 +256,7 @@ bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(
        auto handler = s_dispatcher.find(TI(n));
        if (handler != s_dispatcher.end())
        {
-            handler->second(m_external_function.get(), node.get());
+            handler->second(m_external_function, node.get());
        }
    }

--- a/src/ngraph/runtime/cpu/pass/cpu_assignment.hpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_assignment.hpp
@@ -39,7 +39,7 @@ namespace ngraph
                class CPUAssignment : public ngraph::pass::CallGraphPass
                {
                public:
-                    CPUAssignment(std::shared_ptr<CPU_ExternalFunction> external_function)
+                    /// \brief Creates the pass for the given external function.
+                    ///
+                    /// The pointer is stored as-is and no ownership is taken, so the
+                    /// caller must keep the CPU_ExternalFunction alive while the pass runs.
+                    CPUAssignment(CPU_ExternalFunction* external_function)
                        : m_external_function(external_function)
                    {
                    }
@@ -56,7 +56,7 @@ namespace ngraph
                    }
                private:
-                    std::shared_ptr<CPU_ExternalFunction> m_external_function;
+                    CPU_ExternalFunction* m_external_function;
                };
            }
        }

--- a/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
@@ -719,11 +719,11 @@ bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::share
        auto handler = s_dispatcher.find(TI(n));
        if (handler != s_dispatcher.end())
        {
-            handler->second(m_external_function.get(), node);
+            handler->second(m_external_function, node);
        }
        else
        {
-            set_default_layouts(m_external_function.get(), node);
+            set_default_layouts(m_external_function, node);
        }
    }

--- a/src/ngraph/runtime/cpu/pass/cpu_layout.hpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_layout.hpp
@@ -39,7 +39,7 @@ namespace ngraph
                class CPULayout : public ngraph::pass::CallGraphPass
                {
                public:
-                    CPULayout(std::shared_ptr<CPU_ExternalFunction> external_function)
+                    /// \brief Creates the pass for the given external function.
+                    ///
+                    /// The pointer is stored as-is and no ownership is taken, so the
+                    /// caller must keep the CPU_ExternalFunction alive while the pass runs.
+                    CPULayout(CPU_ExternalFunction* external_function)
                        : m_external_function(external_function)
                    {
                    }
@@ -52,7 +52,7 @@ namespace ngraph
                               std::shared_ptr<ngraph::Node> node);
                private:
-                    std::shared_ptr<CPU_ExternalFunction> m_external_function;
+                    CPU_ExternalFunction* m_external_function;
                    static std::shared_ptr<Node> insert_input_conversions(
                        CPU_ExternalFunction* external_function,
                        std::shared_ptr<Node>& node,

--- a/src/ngraph/runtime/gpu/gpu_emitter.cpp
+++ b/src/ngraph/runtime/gpu/gpu_emitter.cpp
@@ -51,29 +51,6 @@
 using namespace std;
 using namespace ngraph;
-#define NVRTC_SAFE_CALL(x)                                                                         \
-    do                                                                                             \
-    {                                                                                              \
-        nvrtcResult result = x;                                                                    \
-        if (result != NVRTC_SUCCESS)                                                               \
-        {                                                                                          \
-            throw std::runtime_error("\nerror: " #x " failed with error " +                        \
-                                     nvrtcGetErrorString(result));                                 \
-        }                                                                                          \
-    } while (0)
-#define CUDA_SAFE_CALL(x)                                                                          \
-    do                                                                                             \
-    {                                                                                              \
-        CUresult result = x;                                                                       \
-        if (result != CUDA_SUCCESS)                                                                \
-        {                                                                                          \
-            const char* msg;                                                                       \
-            cuGetErrorName(result, &msg);                                                          \
-            throw std::runtime_error("\nerror: " #x " failed with error " + std::string(msg);      \
-        }                                                                                          \
-    } while (0)
 void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
                                        const ngraph::Node* n,
                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
@@ -523,7 +500,6 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
    {
        result_shape_product *= i;
    }
    // If there is no layout change or we are just going from 1^n to 1^m or a zero-size tensor,
    //  we can just copy.
    if (same_layout || result_shape_product < 2)
@@ -531,7 +507,7 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
        writer << "{   // " << n->get_name() << " 1\n";
        writer.indent++;
        writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
-               << ", " << out[0].get_size() << "," << out[0].get_element_type().size() << ");\n";
+               << ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
        writer.indent--;
        writer << "}\n";
    }
@@ -541,8 +517,9 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
        // TODO Assert arg0_shape[0] == arg1_shape[0]?
        writer << "{   // " << n->get_name() << "\n";
        writer.indent++;
-        writer << "static const float alpha = 1.0;\n";
+        writer << "const float alpha = 1.0;\n";
-        writer << "static const float beta = 0.0;\n";
+        writer << "const float beta = 0;\n";
+        writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
        writer << "cublasSgeam("
               << "cublas_handle,"
               << "CUBLAS_OP_T,"
@@ -551,7 +528,8 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
               << args[0].get_name() << "," << arg_shape[1] << ","
               << "&beta," // beta
               << args[0].get_name() << "," << arg_shape[1] << "," << out[0].get_name() << ","
-               << out[0].get_shape()[1] << ");\n";
+               << result_shape[1] << ");\n";
+        writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
        writer.indent--;
        writer << "}\n";
    }

--- a/src/ngraph/runtime/gpu/gpu_util.cpp
+++ b/src/ngraph/runtime/gpu/gpu_util.cpp
@@ -54,18 +54,17 @@ void* runtime::gpu::create_gpu_buffer(size_t buffer_size)
    return allocated_buffer_pool;
 }
-void runtime::gpu::cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size)
+// Copies buffer_size bytes from src to dst using cudaMemcpyDeviceToDevice.
+// The size is now a single byte count; callers multiply element count by
+// element size themselves.
+// NOTE(review): the status returned by cudaMemcpy is discarded, so a failed
+// copy is not reported to the caller.
+void runtime::gpu::cuda_memcpyDtD(void* dst, void* src, size_t buffer_size)
 {
-    size_t size_in_bytes = element_size * element_count;
+    cudaMemcpy(dst, src, buffer_size, cudaMemcpyDeviceToDevice);
-    cudaMemcpy(d, s, size_in_bytes, cudaMemcpyDeviceToDevice);
 }
-void runtime::gpu::cuda_memcpyHtD(void* d, void* s, size_t buffer_size)
+// Copies buffer_size bytes from src to dst using cudaMemcpyHostToDevice.
+// NOTE(review): the status returned by cudaMemcpy is discarded, so a failed
+// copy is not reported to the caller.
+void runtime::gpu::cuda_memcpyHtD(void* dst, void* src, size_t buffer_size)
 {
-    cudaMemcpy(d, s, buffer_size, cudaMemcpyHostToDevice);
+    cudaMemcpy(dst, src, buffer_size, cudaMemcpyHostToDevice);
 }
-void runtime::gpu::cuda_memset(void* d, int value, size_t buffer_size)
+// Fills buffer_size bytes starting at dst with `value` via cudaMemset.
+// NOTE(review): the status returned by cudaMemset is discarded, so a failure
+// is not reported to the caller.
+void runtime::gpu::cuda_memset(void* dst, int value, size_t buffer_size)
 {
-    cudaMemset(d, value, buffer_size);
+    cudaMemset(dst, value, buffer_size);
 }
--- a/src/ngraph/runtime/gpu/gpu_util.hpp
+++ b/src/ngraph/runtime/gpu/gpu_util.hpp
@@ -61,9 +61,9 @@ namespace ngraph
            void print_gpu_f32_tensor(void* p, size_t element_count, size_t element_size);
            void check_cuda_errors(CUresult err);
            void* create_gpu_buffer(size_t buffer_size);
-            void cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size);
+            void cuda_memcpyDtD(void* dst, void* src, size_t buffer_size);
-            void cuda_memcpyHtD(void* d, void* s, size_t buffer_size);
+            void cuda_memcpyHtD(void* dst, void* src, size_t buffer_size);
-            void cuda_memset(void* d, int value, size_t buffer_size);
+            void cuda_memset(void* dst, int value, size_t buffer_size);
        }
    }
 }
--- a/src/ngraph/serializer.cpp
+++ b/src/ngraph/serializer.cpp
@@ -19,6 +19,7 @@
 #include "ngraph/ops/abs.hpp"
 #include "ngraph/ops/acos.hpp"
 #include "ngraph/ops/add.hpp"
+#include "ngraph/ops/allreduce.hpp"
 #include "ngraph/ops/asin.hpp"
 #include "ngraph/ops/atan.hpp"
 #include "ngraph/ops/avg_pool.hpp"
@@ -75,15 +76,27 @@
 #include "ngraph/ops/tan.hpp"
 #include "ngraph/ops/tanh.hpp"
 #include "ngraph/util.hpp"
+#include "nlohmann/json.hpp"
-#ifdef NGRAPH_DISTRIBUTED
-#include "ngraph/ops/allreduce.hpp"
-#endif
 using namespace ngraph;
 using namespace std;
 using json = nlohmann::json;
+// Looks up `key` in `j` and converts the value to T.
+//
+// Returns `default_value` if the lookup or the conversion throws (for example
+// when the key is absent). Returning straight from the try/catch means T does
+// not have to be default-constructible or assignable.
+template <typename T>
+T get_or_default(nlohmann::json& j, const std::string& key, const T& default_value)
+{
+    try
+    {
+        return j.at(key).get<T>();
+    }
+    catch (...)
+    {
+        // Deliberately broad: any failure to read the field falls back to the default.
+        return default_value;
+    }
+}
 static std::shared_ptr<ngraph::Function>
    read_function(const json&, std::unordered_map<std::string, std::shared_ptr<Function>>&);
@@ -263,12 +276,10 @@ static shared_ptr<ngraph::Function>
        {
            node = make_shared<op::Add>(args[0], args[1]);
        }
-#ifdef NGRAPH_DISTRIBUTED
        else if (node_op == "AllReduce")
        {
            node = make_shared<op::AllReduce>(args[0]);
        }
-#endif
        else if (node_op == "Asin")
        {
            node = make_shared<op::Asin>(args[0]);

--- a/src/ngraph/serializer.hpp
+++ b/src/ngraph/serializer.hpp
@@ -21,26 +21,10 @@
 #include "ngraph/function.hpp"
 #include "ngraph/node.hpp"
-#include "nlohmann/json.hpp"
 namespace ngraph
 {
    std::string serialize(std::shared_ptr<ngraph::Function>, size_t indent = 0);
    std::shared_ptr<ngraph::Function> deserialize(std::istream&);
    std::shared_ptr<ngraph::Function> deserialize(const std::string&);
-    template <typename T>
-    T get_or_default(nlohmann::json& j, const std::string& key, const T& default_value)
-    {
-        T rc;
-        try
-        {
-            rc = j.at(key).get<T>();
-        }
-        catch (...)
-        {
-            rc = default_value;
-        }
-        return rc;
-    }
 }
--- a/src/ngraph/shape.hpp
+++ b/src/ngraph/shape.hpp
@@ -25,7 +25,6 @@
 namespace ngraph
 {
-#ifdef NO_GLOBAL_TYPE_ALIASES
    /// \brief Shape for a tensor.
    class Shape : public std::vector<size_t>
    {
@@ -68,10 +67,6 @@ namespace ngraph
            return *this;
        }
    };
-#else
-    // Deprecated definition
-    using Shape = std::vector<size_t>;
-#endif
    /// Number of elements in spanned by a shape
    size_t shape_size(const Shape& shape);
@@ -81,5 +76,4 @@ namespace ngraph
    inline bool is_scalar(const Shape& shape) { return 0 == shape.size(); }
    inline bool is_vector(const Shape& shape) { return 1 == shape.size(); }
-    Shape project_shape(const Shape& shape, const AxisSet& deleted_axes);
 }
--- a/src/ngraph/strides.hpp
+++ b/src/ngraph/strides.hpp
@@ -20,7 +20,6 @@
 namespace ngraph
 {
-#ifdef NO_GLOBAL_TYPE_ALIASES
    /// \brief Strides for a tensor.
    class Strides : public std::vector<size_t>
    {
@@ -63,8 +62,4 @@ namespace ngraph
            return *this;
        }
    };
-#else
-    // Deprecated definition
-    using Strides = std::vector<size_t>;
-#endif
 }
--- a/src/tools/CMakeLists.txt
+++ b/src/tools/CMakeLists.txt
@@ -14,26 +14,4 @@
 # limitations under the License.
 # ******************************************************************************
-if(MKLDNN_INCLUDE_DIR)
+add_subdirectory(nbench)
-    link_directories(${MKLDNN_LIB_DIR})
-endif()
-if (NGRAPH_CPU_ENABLE)
-    set (SRC
-        nbench.cpp
-        ${PROJECT_SOURCE_DIR}/test/util/benchmark.cpp
-    )
-    add_executable(nbench ${SRC})
-    add_dependencies(nbench ngraph)
-    set(HEADER_SEARCH_DEFINES
-        "NGRAPH_HEADERS_PATH=\"${NGRAPH_INCLUDE_PATH}\""
-    )
-    target_link_libraries(nbench ngraph)
-    include_directories(SYSTEM ${JSON_INCLUDE_DIR})
-    set_source_files_properties(nbench.cpp PROPERTIES COMPILE_DEFINITIONS "${HEADER_SEARCH_DEFINES}")
-endif()
--- a/src/tools/nbench/CMakeLists.txt
+++ b/src/tools/nbench/CMakeLists.txt
+# ******************************************************************************
+# Copyright 2017-2018 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ******************************************************************************
+# Build configuration for nbench, the tool that benchmarks an ngraph json model
+# with a given backend.
+#
+# When MKLDNN_INCLUDE_DIR is set, MKLDNN_LIB_DIR is added to the linker search
+# path. link_directories() only applies to targets created after it is called,
+# so it has to stay ahead of add_executable().
+if(MKLDNN_INCLUDE_DIR)
+    link_directories(${MKLDNN_LIB_DIR})
+endif()
+# nbench is only built when NGRAPH_CPU_ENABLE is set.
+if (NGRAPH_CPU_ENABLE)
+    set (SRC
+        nbench.cpp
+        ${PROJECT_SOURCE_DIR}/test/util/benchmark.cpp
+    )
+    add_executable(nbench ${SRC})
+    add_dependencies(nbench ngraph)
+    # Passed to nbench.cpp only (see set_source_files_properties below).
+    set(HEADER_SEARCH_DEFINES
+        "NGRAPH_HEADERS_PATH=\"${NGRAPH_INCLUDE_PATH}\""
+    )
+    # Explicit PRIVATE: nbench is an executable, so nothing consumes its link
+    # interface, and the keyword form avoids the legacy plain signature.
+    target_link_libraries(nbench PRIVATE ngraph)
+    # Lets the sources include "util/benchmark.hpp" and "util/test_tools.hpp".
+    # Scoped to the nbench target so the test tree does not leak into the
+    # include path of any other target defined in this directory.
+    target_include_directories(nbench PRIVATE "${PROJECT_SOURCE_DIR}/test")
+    set_source_files_properties(nbench.cpp PROPERTIES COMPILE_DEFINITIONS "${HEADER_SEARCH_DEFINES}")
+endif()
--- a/src/tools/nbench.cpp
+++ b/src/tools/nbench.cpp
@@ -24,8 +24,8 @@
 #include <ngraph/runtime/backend.hpp>
 #include <ngraph/runtime/call_frame.hpp>
 #include <ngraph/runtime/manager.hpp>
-#include "../../test/util/benchmark.hpp"
+#include "util/benchmark.hpp"
-#include "../../test/util/test_tools.hpp"
+#include "util/test_tools.hpp"
 using namespace std;
 int main(int argc, char** argv)
@@ -66,16 +66,16 @@ int main(int argc, char** argv)
    if (failed)
    {
        cout << R"###(
-DESCRIPTION                                                         
+DESCRIPTION
-    Benchmark ngraph json model with given backend.                 
+    Benchmark ngraph json model with given backend.
-SYNOPSIS                                                            
+SYNOPSIS
        nbench [-f <filename>] [-b <backend>] [-i <iterations>]
-OPTIONS                                                             
+OPTIONS
-        -f          model json file to use (default: model.json)    
+        -f          model json file to use (default: model.json)
-        -b          Backend to use (default: INTERPRETER)           
+        -b          Backend to use (default: INTERPRETER)
-        -i          Iterations (default: 10)                        
+        -i          Iterations (default: 10)
 )###";
        return 1;
    }

--- a/test/backend_test.in.cpp
+++ b/test/backend_test.in.cpp
@@ -43,6 +43,29 @@ static const vector<element::Type> s_known_element_types = {element::from<float>
                                                            element::from<uint32_t>(),
                                                            element::from<uint64_t>()};
+// Checks the ownership of the runtime objects after compilation: once the
+// locals of the inner scope (function, parameters, manager) are destroyed,
+// each of the three shared_ptrs declared below is expected to be the only
+// remaining owner of its object.
+TEST(${BACKEND_NAME}, component_cleanup)
+{
+    // Declared outside the inner scope so they outlive everything created in it.
+    shared_ptr<runtime::Backend> backend;
+    shared_ptr<runtime::ExternalFunction> external;
+    shared_ptr<runtime::CallFrame> cf;
+    {
+        Shape shape{2, 2};
+        auto A = make_shared<op::Parameter>(element::f32, shape);
+        auto B = make_shared<op::Parameter>(element::f32, shape);
+        auto f = make_shared<Function>(A + B, op::ParameterVector{A, B});
+        auto manager = runtime::Manager::get("${BACKEND_NAME}");
+        external = manager->compile(f);
+        backend = manager->allocate_backend();
+        cf = backend->make_call_frame(external);
+    }
+    // The objects are checked and released one at a time, in this order.
+    // NOTE(review): presumably the call frame may hold references to the
+    // backend and/or the external function, which is why it is dropped
+    // before their use counts are checked -- confirm against the backends.
+    EXPECT_EQ(cf.use_count(), 1);
+    cf = nullptr;
+    EXPECT_EQ(backend.use_count(), 1);
+    backend = nullptr;
+    EXPECT_EQ(external.use_count(), 1);
+}
 TEST(${BACKEND_NAME}, aliased_output)
 {
    Shape shape{2, 2};
@@ -138,7 +161,7 @@ TEST(${BACKEND_NAME}, abc)
    auto f = make_shared<Function>((A + B) * C, op::ParameterVector{A, B, C});
    auto manager = runtime::Manager::get("${BACKEND_NAME}");
-    auto external = manager->compile(f);
+    shared_ptr<runtime::ExternalFunction> external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);
@@ -2373,7 +2396,6 @@ TEST(${BACKEND_NAME}, reshape_s2t)
 TEST(${BACKEND_NAME}, reshape_v2m_col)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{3};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{3, 1};
@@ -2396,7 +2418,6 @@ TEST(${BACKEND_NAME}, reshape_v2m_col)
 TEST(${BACKEND_NAME}, reshape_v2m_row)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{3};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{1, 3};
@@ -2442,7 +2463,6 @@ TEST(${BACKEND_NAME}, reshape_v2t_middle)
 TEST(${BACKEND_NAME}, reshape_m2m_same)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{3, 3};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{3, 3};
@@ -2465,7 +2485,6 @@ TEST(${BACKEND_NAME}, reshape_m2m_same)
 TEST(${BACKEND_NAME}, reshape_m2m_transpose)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{3, 3};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{3, 3};
@@ -2488,7 +2507,6 @@ TEST(${BACKEND_NAME}, reshape_m2m_transpose)
 TEST(${BACKEND_NAME}, reshape_m2m_dim_change_transpose)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{3, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{2, 3};

--- a/test/serialize.cpp
+++ b/test/serialize.cpp
@@ -30,6 +30,21 @@ using namespace std;
 using namespace ngraph;
 using json = nlohmann::json;
+/// \brief Read a value from a json object, falling back to a default.
+/// \param j The json object to look the key up in.
+/// \param key Name of the entry to read.
+/// \param default_value Value returned when the entry cannot be read.
+/// \returns The entry stored under key converted to T, or default_value if the
+///          lookup or the conversion throws.
+template <typename T>
+T get_or_default(nlohmann::json& j, const std::string& key, const T& default_value)
+{
+    try
+    {
+        return j.at(key).get<T>();
+    }
+    catch (...)
+    {
+        // Any failure (missing key, wrong type, ...) selects the fallback.
+        return default_value;
+    }
+}
 TEST(serialize, main)
 {
    // First create "f(A,B,C) = (A+B)*C".

--- a/test/util/benchmark.cpp
+++ b/test/util/benchmark.cpp
@@ -13,6 +13,9 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
+#include <iomanip>
 #include "benchmark.hpp"
 #include "ngraph/runtime/backend.hpp"
 #include "ngraph/runtime/call_frame.hpp"