Unverified Commit 90503652 authored by Robert Kimball, committed by GitHub

Hybrid GPU Backend (#2240)

* Add GPUH hybrid backend

* update manifests

* update node operator<<

* fix GOE

* remove debug

* remove debug

* more cleanup

* add parent support to cpu and intel gpu backend tensors

* cleanup

* fix odd failure when printing node during construction

* fix node output

* address review comments

* style
parent 42f16035
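For orientation, a minimal sketch of how the pieces added in this commit fit together: the GPUH backend is a HybridBackend built from an ordered list of concrete backends, and tensors now record the backend that created them. Only names that appear in this diff are used below; headers are omitted, and reading the backend list as a placement priority order is an assumption.

// Sketch only -- not part of the diff; composition of the hybrid backend added here.
auto gpu = std::make_shared<ngraph::runtime::gpu::GPU_Backend>();
auto interpreter = std::make_shared<ngraph::runtime::interpreter::INTBackend>();
// Assumed: earlier backends in the list take placement priority over later ones.
ngraph::runtime::hybrid::HybridBackend hybrid({gpu, interpreter});
// Tensors now carry a pointer to the backend that created them.
auto t = gpu->create_tensor(ngraph::element::f32, ngraph::Shape{2, 3});
const ngraph::runtime::Backend* owner = t->get_parent(); // == gpu.get()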
......@@ -75,7 +75,6 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
include(var_functions)
set(NGRAPH_HYBRID_ENABLE TRUE)
option(NGRAPH_UNIT_TEST_ENABLE "Control the building of unit tests" TRUE)
option(NGRAPH_TOOLS_ENABLE "Control the building of tools" TRUE)
option(NGRAPH_CPU_ENABLE "Control the building of the CPU backend" TRUE)
......@@ -92,6 +91,10 @@ option(NGRAPH_CODE_COVERAGE_ENABLE "Enable code coverage data collection" FALSE)
option(NGRAPH_LIB_VERSIONING_ENABLE "Enable shared library versioning" FALSE)
option(NGRAPH_PYTHON_BUILD_ENABLE "Enable build nGraph python package wheel" FALSE)
if (NGRAPH_GPUH_ENABLE)
set(NGRAPH_GPU_ENABLE TRUE)
endif()
message(STATUS "NGRAPH_UNIT_TEST_ENABLE: ${NGRAPH_UNIT_TEST_ENABLE}")
message(STATUS "NGRAPH_TOOLS_ENABLE: ${NGRAPH_TOOLS_ENABLE}")
message(STATUS "NGRAPH_CPU_ENABLE: ${NGRAPH_CPU_ENABLE}")
......@@ -108,10 +111,6 @@ message(STATUS "NGRAPH_CODE_COVERAGE_ENABLE: ${NGRAPH_CODE_COVERAGE_ENABLE}")
message(STATUS "NGRAPH_LIB_VERSIONING_ENABLE: ${NGRAPH_LIB_VERSIONING_ENABLE}")
message(STATUS "NGRAPH_PYTHON_BUILD_ENABLE: ${NGRAPH_PYTHON_BUILD_ENABLE}")
if (NGRAPH_HYBRID_ENABLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_HYBRID_ENABLE")
endif()
if (NGRAPH_ONNX_IMPORT_ENABLE)
option(NGRAPH_USE_SYSTEM_PROTOBUF "Use system provided Protobuf shared object" FALSE)
option(NGRAPH_ONNXIFI_ENABLE "Enable ONNX Interface for Framework Integration" TRUE)
......
......@@ -238,7 +238,14 @@ std::ostream& Node::write_long_description(std::ostream& out) const
{
out << sep << NodeDescription(*arg, true) << ": "
<< pretty_element_type(arg->get_output_element_type(0))
<< arg->get_output_partial_shape(0) << "";
<< arg->get_output_partial_shape(0);
sep = ", ";
}
out << ") -> (";
sep = "";
for (const auto& o : get_outputs())
{
out << sep << pretty_element_type(o.get_element_type()) << o.get_partial_shape();
sep = ", ";
}
out << ")";
......
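With the extra loop above, a node's long description now lists its outputs as well as its inputs, roughly (hypothetical node names and shapes; exact formatting may differ): Add_5(Parameter_1: f32{2,3}, Parameter_2: f32{2,3}) -> (f32{2,3})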
......@@ -15,10 +15,7 @@
# ******************************************************************************
add_subdirectory(interpreter)
if (NGRAPH_HYBRID_ENABLE)
add_subdirectory(hybrid)
endif()
add_subdirectory(hybrid)
if (NGRAPH_CPU_ENABLE)
add_subdirectory(cpu)
......
......@@ -58,13 +58,13 @@ shared_ptr<runtime::cpu::CPU_CallFrame> runtime::cpu::CPU_Backend::make_call_fra
shared_ptr<runtime::Tensor>
runtime::cpu::CPU_Backend::create_tensor(const element::Type& element_type, const Shape& shape)
{
return make_shared<runtime::cpu::CPUTensorView>(element_type, shape);
return make_shared<runtime::cpu::CPUTensorView>(element_type, shape, this);
}
shared_ptr<runtime::Tensor> runtime::cpu::CPU_Backend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
return make_shared<runtime::cpu::CPUTensorView>(element_type, shape, memory_pointer);
return make_shared<runtime::cpu::CPUTensorView>(element_type, shape, memory_pointer, this);
}
runtime::Handle runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func)
......
......@@ -35,8 +35,9 @@ using namespace std;
runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const string& name)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, name))
const runtime::Backend* parent)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, "external"),
parent)
, buffer(nullptr)
, aligned_buffer(nullptr)
{
......@@ -77,8 +78,8 @@ runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_
runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
const string& name)
: CPUTensorView(element_type, shape, nullptr, name)
const runtime::Backend* parent)
: CPUTensorView(element_type, shape, nullptr, parent)
{
}
......
......@@ -35,11 +35,11 @@ namespace ngraph
public:
CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
const std::string& name = "external");
const runtime::Backend* parent);
CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const std::string& name = "external");
const runtime::Backend* parent);
virtual ~CPUTensorView() override;
char* get_data_ptr();
......
......@@ -159,11 +159,6 @@ if (NGRAPH_GPU_ENABLE)
${CUDA_LIBRARIES}
${CUDA_CUBLAS_LIBRARIES}
${CUDNN_LIBRARIES})
if (NGRAPH_HYBRID_ENABLE)
target_link_libraries(gpu_backend
PRIVATE
hybrid_backend)
endif()
set_target_properties(gpu_backend PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
......
......@@ -20,6 +20,7 @@
#include <cudnn.h>
#include "ngraph/graph_util.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
......@@ -107,13 +108,13 @@ runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
shared_ptr<runtime::Tensor>
runtime::gpu::GPU_Backend::create_tensor(const element::Type& element_type, const Shape& shape)
{
return make_shared<runtime::gpu::GPUTensor>(element_type, shape);
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, this);
}
shared_ptr<runtime::Tensor> runtime::gpu::GPU_Backend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, memory_pointer);
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, memory_pointer, this);
}
runtime::Handle runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func)
......@@ -222,33 +223,53 @@ vector<runtime::PerformanceCounter>
return rc;
}
bool runtime::gpu::GPU_Backend::is_supported(const Node& node) const
bool runtime::gpu::GPU_Backend::is_supported(const Node& op) const
{
bool rc = true;
set<string> unsupported_ops = {"Quantize",
"Dequantize",
"ShapeOf",
"All",
"Any",
"AllReduce",
"SelectAndScatter",
"StopGradient",
"EmbeddingLookup",
"GenerateMask"};
// get op type
element::Type type;
if (node.description() == "Select")
{
type = node.get_input_element_type(1);
}
else if (node.description() == "Constant")
set<string> float_only = {"MaxPoolBackprop", "AvgPoolBackprop", "MaxPool", "Dot"};
if (unsupported_ops.find(op.description()) != unsupported_ops.end())
{
type = node.get_outputs().at(0).get_element_type();
return false;
}
else if (node.description() == "Parameter")
if (float_only.find(op.description()) != float_only.end())
{
type = node.get_outputs().at(0).get_element_type();
if (op.get_output_element_type(0) != element::f32 &&
op.get_output_element_type(0) != element::f64)
{
return false;
}
}
else
if (op.description() == "BatchNormInference")
{
type = node.get_input_element_type(0);
const ngraph::op::BatchNormInference* bn =
static_cast<const ngraph::op::BatchNormInference*>(&op);
if (bn->get_eps_value() < CUDNN_BN_MIN_EPSILON)
{
return false;
}
}
if (type != element::f32)
else if (op.description() == "BatchNormTraining")
{
rc = false;
const ngraph::op::BatchNormTraining* bn =
static_cast<const ngraph::op::BatchNormTraining*>(&op);
if (bn->get_eps_value() < CUDNN_BN_MIN_EPSILON)
{
return false;
}
}
return rc;
return true;
}
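The reworked is_supported() replaces the old per-op element-type special cases with an explicit unsupported-op list, a float-only list, and a check against cuDNN's minimum batch-norm epsilon. A placement policy built on top of it can take the first backend that accepts a node; the helper below is a sketch of that idea under that assumption, not the actual AssignPlacement pass.

// Sketch only: first-match placement over an ordered backend list (assumed policy).
static size_t pick_placement(const ngraph::Node& node,
                             const std::vector<std::shared_ptr<ngraph::runtime::Backend>>& backends)
{
    for (size_t i = 0; i < backends.size(); ++i)
    {
        if (backends[i]->is_supported(node))
        {
            return i; // the index doubles as the node's placement index
        }
    }
    throw ngraph::ngraph_error("no backend supports op " + node.description());
}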
......@@ -29,8 +29,10 @@ using namespace std;
runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, "external"))
void* memory_pointer,
const Backend* backend)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, "external"),
backend)
, m_custom_memory(false)
{
m_descriptor->set_tensor_layout(
......@@ -48,8 +50,10 @@ runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
}
}
runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type, const Shape& shape)
: GPUTensor(element_type, shape, nullptr)
runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const Backend* backend)
: GPUTensor(element_type, shape, nullptr, backend)
{
}
......
......@@ -16,9 +16,9 @@
#pragma once
#include <cuda.h>
#include <memory>
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/type/element_type.hpp"
......@@ -36,8 +36,11 @@ namespace ngraph
class ngraph::runtime::gpu::GPUTensor : public ngraph::runtime::Tensor
{
public:
GPUTensor(const ngraph::element::Type& element_type, const Shape& shape);
GPUTensor(const ngraph::element::Type& element_type, const Shape& shape, void* memory_pointer);
GPUTensor(const ngraph::element::Type& element_type, const Shape& shape, const Backend* parent);
GPUTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const Backend* parent);
virtual ~GPUTensor() override;
/// \brief Write bytes directly into the tensor
......
#int64 is not supported by cuDNN
batch_norm_one_output
batch_norm_three_outputs
backwards_batch_norm_three_outputs
#need to check
# need to check
computation_reuse
#cuda does not support throw
# cuda does not support throw
divide_by_zero_int32
#int64 is not supported by cuDNN
# int64 is not supported by cuDNN
dot_matrix_vector_int64
generate_mask
#error throw is not the same on GPU, not supported yet
one_hot_scalar_fp_nonint_in_3
one_hot_scalar_oob_in_3
one_hot_vector_1_barely_oob
one_hot_vector_1_far_oob
one_hot_vector_1_fp_nonint
#select_and_scatter is deprecated
# select_and_scatter is deprecated
select_and_scatter_3d_without_overlap
select_and_scatter_with_overlap
select_and_scatter_without_overlap
#custom_mem is not implemented on GPU
# custom_mem is not implemented on GPU
tensorview_custom_mem
#integer is not supported by cuDNN on backward pooling
# integer is not supported by cuDNN on backward pooling
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_avgpool_n1_c1_hw2x2
......
......@@ -21,7 +21,7 @@ if (NGRAPH_GPUH_ENABLE)
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
target_link_libraries(gpuh_backend PUBLIC ngraph)
target_link_libraries(gpuh_backend PUBLIC ngraph hybrid_base gpu_backend)
set_target_properties(gpuh_backend PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
install(TARGETS gpuh_backend
......
......@@ -18,6 +18,7 @@
#include "ngraph/graph_util.hpp"
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/interpreter/int_backend.hpp"
#include "ngraph/runtime/tensor.hpp"
......@@ -34,7 +35,13 @@ extern "C" runtime::Backend* new_backend(const char* configuration_string)
return new runtime::gpuh::GPUHBackend();
}
vector<string> get_excludes()
{
return vector<string>{{"Not"}};
}
runtime::gpuh::GPUHBackend::GPUHBackend()
: HybridBackend({{"INTERPRETER", make_shared<ngraph::runtime::interpreter::INTBackend>()}})
: HybridBackend({make_shared<ngraph::runtime::gpu::GPU_Backend>(),
make_shared<ngraph::runtime::interpreter::INTBackend>()})
{
}
computation_reuse
tensorview_custom_mem
batch_norm_inference_f64
batch_norm_inference_f32
divide_by_zero_int32
......@@ -26,8 +26,10 @@ using namespace std;
runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const string& name)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, name))
const string& name,
const Backend* parent)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, name),
parent)
, m_allocated_buffer_pool(nullptr)
, m_aligned_buffer_pool(nullptr)
......@@ -56,8 +58,24 @@ runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const string& name)
: HostTensor(element_type, shape, nullptr, name)
const string& name,
const Backend* parent)
: HostTensor(element_type, shape, nullptr, name, parent)
{
}
runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const Backend* parent)
: HostTensor(element_type, shape, nullptr, "external", parent)
{
}
runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const Backend* parent)
: HostTensor(element_type, shape, memory_pointer, "external", parent)
{
}
......
......@@ -18,6 +18,7 @@
#include <memory>
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/type/element_type.hpp"
......@@ -36,11 +37,20 @@ class ngraph::runtime::HostTensor : public ngraph::runtime::Tensor
public:
HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const std::string& name = "external");
const std::string& name = "external",
const Backend* parent = nullptr);
HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const std::string& name = "external");
const std::string& name = "external",
const Backend* parent = nullptr);
HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const Backend* parent);
HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const Backend* parent);
virtual ~HostTensor() override;
char* get_data_ptr();
......
......@@ -14,21 +14,14 @@
# limitations under the License.
# ******************************************************************************
if (NGRAPH_HYBRID_ENABLE)
add_library(hybrid_backend SHARED
hybrid_backend.cpp
hybrid_util.cpp
pass/assign_placement.cpp)
if(NGRAPH_LIB_VERSIONING_ENABLE)
set_target_properties(hybrid_backend PROPERTIES
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
target_link_libraries(hybrid_backend PUBLIC ngraph)
set_target_properties(hybrid_backend PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
add_library(hybrid_base STATIC
hybrid_backend.cpp
hybrid_util.cpp
pass/assign_placement.cpp
pass/fix_get_output_element.cpp)
target_link_libraries(hybrid_base PUBLIC ngraph)
set_target_properties(hybrid_base PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
install(TARGETS hybrid_backend
LIBRARY DESTINATION "${NGRAPH_INSTALL_LIB}"
ARCHIVE DESTINATION "${NGRAPH_INSTALL_LIB}"
)
endif()
install(TARGETS hybrid_base
ARCHIVE DESTINATION "${NGRAPH_INSTALL_LIB}"
)
......@@ -37,8 +37,7 @@ namespace ngraph
class ngraph::runtime::hybrid::HybridBackend : public ngraph::runtime::Backend
{
public:
HybridBackend(
const std::vector<std::pair<std::string, std::shared_ptr<runtime::Backend>>>& backend_list);
HybridBackend(const std::vector<std::shared_ptr<runtime::Backend>>& backend_list);
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
......@@ -69,5 +68,9 @@ private:
};
std::map<std::shared_ptr<ngraph::Function>, FunctionInstance> m_function_map;
std::vector<std::pair<std::string, std::shared_ptr<runtime::Backend>>> m_backend_list;
std::vector<std::shared_ptr<runtime::Backend>> m_backend_list;
std::string get_placement_name(const runtime::Tensor* t);
std::string get_placement_name(const runtime::Backend* t);
size_t get_placement(const runtime::Tensor* t);
};
......@@ -15,11 +15,13 @@
//*****************************************************************************
#include "ngraph/runtime/hybrid/hybrid_util.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/visualize_tree.hpp"
using namespace ngraph;
using namespace std;
static Node* take_independent_node_with_placement_priority_size(
static Node* take_independent_node_with_placement_priority(
map<size_t, deque<Node*>>& independent_nodes_by_placement, size_t placement)
{
Node* selected_node = nullptr;
......@@ -45,7 +47,7 @@ static Node* take_independent_node_with_placement_priority_size(
}
static vector<unordered_set<shared_ptr<Node>>>
group_function_nodes_to_clusters_size(const shared_ptr<Function>& f)
group_function_nodes_to_clusters(const shared_ptr<Function>& f)
{
// Topologically sort nodes by picking independent node with the same placement as the
// previously picked node greedily
......@@ -66,7 +68,7 @@ static vector<unordered_set<shared_ptr<Node>>>
list<shared_ptr<Node>> sorted_nodes;
size_t previous_placement = 0;
while (Node* independent_node = take_independent_node_with_placement_priority_size(
while (Node* independent_node = ::take_independent_node_with_placement_priority(
independent_nodes_by_placement, previous_placement))
{
previous_placement = independent_node->get_placement_index();
......@@ -148,42 +150,48 @@ static vector<unordered_set<shared_ptr<Node>>>
// | <------[3]------+ | | | <------[7]------+ | | <------[11]-----+ |
// +-----+ +-----+ | +-----+ +-----+ +-----+ +-----+
// Suffix *_size as a part of function name is temporary, this suffix
// will be removed when the backends move to the latest Hybrid backend
pair<shared_ptr<op::Result>, shared_ptr<op::Parameter>>
insert_result_parameter_split_size(const shared_ptr<Node>& src_node,
const shared_ptr<Node>& dst_node)
static map<shared_ptr<op::Result>, shared_ptr<op::Parameter>>
insert_result_parameter_split(const shared_ptr<Node>& src_node,
const shared_ptr<Node>& dst_node)
{
if (src_node->get_output_size() != 1)
map<shared_ptr<op::Result>, shared_ptr<op::Parameter>> result_map;
for (descriptor::Input& input : dst_node->get_inputs())
{
throw ngraph_error("Multiple output per op not supported in graph partition yet.");
}
if (input.get_output().get_node() == src_node)
{
descriptor::Input* dst_input = &input;
descriptor::Output* src_output = &input.get_output();
// Make parameter node
shared_ptr<op::Parameter> par_node = make_shared<op::Parameter>(
src_node->get_output_element_type(0), src_node->get_output_shape(0));
par_node->set_placement_index(dst_node->get_placement_index());
// Make parameter node
shared_ptr<op::Parameter> par_node =
make_shared<op::Parameter>(src_output->get_element_type(), src_output->get_shape());
par_node->set_placement_index(dst_node->get_placement_index());
// Fix input / output among src, dst and par
descriptor::Input* dst_input = dst_node->get_input_from(src_node);
descriptor::Output* src_output = src_node->get_output_to(dst_node);
src_output->remove_input(dst_input); // Remove [0]
dst_input->replace_output(par_node, 0); // Remove [0] (again), add [8], remove [1], add [9]
// Fix input / output among src, dst and par
// Remove [0]
src_output->remove_input(dst_input);
// Add res node
shared_ptr<op::Result> res_node = make_shared<op::Result>(src_node); // Add [4], [5], [6], [7]
res_node->set_placement_index(src_node->get_placement_index());
// Remove [0] (again), add [8], remove [1], add [9]
dst_input->replace_output(par_node, 0);
return make_pair(res_node, par_node);
// Add res node
shared_ptr<op::Result> res_node =
make_shared<op::Result>(src_node); // Add [4], [5], [6], [7]
res_node->set_placement_index(src_node->get_placement_index());
result_map.insert({res_node, par_node});
}
}
return result_map;
}
// Suffix *_size as a part of function name is temporary, this suffix
// will be removed when the backends move to the latest Hybrid backend
pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>>>
runtime::hybrid::split_function_by_placement_size(const shared_ptr<Function>& f)
runtime::hybrid::split_function_by_placement(const shared_ptr<Function>& f)
{
// Split functions to clusters of nodes that can be computed together
vector<unordered_set<shared_ptr<Node>>> clusters = group_function_nodes_to_clusters_size(f);
vector<unordered_set<shared_ptr<Node>>> clusters = ::group_function_nodes_to_clusters(f);
// Map from (intermediate) parameter to result node, for guiding data copy among devices
unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>> map_parameter_to_result;
......@@ -208,15 +216,18 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
if (src_cluster != dst_cluster)
{
// Split src_node and dst_node
pair<shared_ptr<op::Result>, shared_ptr<op::Parameter>> res_par_pair =
insert_result_parameter_split_size(src_node, dst_node);
shared_ptr<op::Result> res_node = res_par_pair.first;
shared_ptr<op::Parameter> par_node = res_par_pair.second;
map_parameter_to_result[par_node] = res_node;
map<shared_ptr<op::Result>, shared_ptr<op::Parameter>> res_par_pair_map =
::insert_result_parameter_split(src_node, dst_node);
for (const auto& res_par_pair : res_par_pair_map)
{
shared_ptr<op::Result> res_node = res_par_pair.first;
shared_ptr<op::Parameter> par_node = res_par_pair.second;
map_parameter_to_result[par_node] = res_node;
// Insert newly created nodes into clusters
src_cluster->insert(res_node);
dst_cluster->insert(par_node);
// Insert newly created nodes into clusters
src_cluster->insert(res_node);
dst_cluster->insert(par_node);
}
}
}
}
......@@ -240,15 +251,19 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
}
auto sub_function = make_shared<Function>(res_vector, par_vector);
sub_functions.push_back(sub_function);
#ifdef HYBRID_DEBUG
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<ngraph::pass::VisualizeTree>("subgraph_" + to_string(index++) +
".png");
pass_manager.run_passes(sub_function);
#endif
}
return make_pair(sub_functions, map_parameter_to_result);
}
// Suffix *_size as a part of function name is temporary, this suffix
// will be removed when the backends move to the latest Hybrid backend
// Assert that nodes in the function are colocated and return that placement
size_t runtime::hybrid::get_colocated_function_placement_size(shared_ptr<Function> func)
size_t runtime::hybrid::get_colocated_function_placement(shared_ptr<Function> func)
{
auto ops = func->get_ops();
......@@ -259,7 +274,7 @@ size_t runtime::hybrid::get_colocated_function_placement_size(shared_ptr<Functio
size_t node_placement = op->get_placement_index();
if (node_placement == Node::placement_invalid)
{
throw ngraph_error("Node should have a device placement");
throw ngraph_error("Node " + op->get_name() + " should have a device placement");
}
if (function_placement != node_placement)
{
......
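With the *_size suffixes gone, the hybrid compile path reads as: split the function into per-placement subfunctions, then use each subfunction's single placement index to pick the backend that compiles it. A rough sketch, assuming m_backend_list (the member declared in hybrid_backend.hpp) is indexed by placement:

// Sketch only: driving the two renamed helpers from a hybrid compile step.
auto split = ngraph::runtime::hybrid::split_function_by_placement(func);
// split.second maps each inserted Parameter to the Result that feeds it,
// which guides tensor copies between devices at call time.
for (const std::shared_ptr<ngraph::Function>& sub : split.first)
{
    size_t placement = ngraph::runtime::hybrid::get_colocated_function_placement(sub);
    m_backend_list.at(placement)->compile(sub); // assumed: placement index selects the backend
}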
......@@ -34,10 +34,10 @@ namespace ngraph
std::pair<
std::vector<std::shared_ptr<Function>>,
std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<op::Result>>>
split_function_by_placement_size(const std::shared_ptr<Function>& f);
split_function_by_placement(const std::shared_ptr<Function>& f);
// Assert that nodes in the function are colocated and return that placement
size_t get_colocated_function_placement_size(std::shared_ptr<Function> func);
size_t get_colocated_function_placement(std::shared_ptr<Function> func);
}
}
}
......@@ -24,7 +24,7 @@ using namespace ngraph;
using namespace std;
runtime::hybrid::pass::AssignPlacement::AssignPlacement(
vector<shared_ptr<runtime::Backend>> placement_backends)
const vector<shared_ptr<runtime::Backend>>& placement_backends)
: m_placement_backends(placement_backends)
{
}
......
......@@ -39,8 +39,8 @@ namespace ngraph
class ngraph::runtime::hybrid::pass::AssignPlacement : public ngraph::pass::NodePass
{
public:
// TODO: make policy a class
AssignPlacement(std::vector<std::shared_ptr<ngraph::runtime::Backend>> placement_backends);
AssignPlacement(
const std::vector<std::shared_ptr<ngraph::runtime::Backend>>& placement_backends);
private:
bool run_on_node(std::shared_ptr<Node> node) override;
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/hybrid/pass/fix_get_output_element.hpp"
#include "ngraph/log.hpp"
#include "ngraph/node.hpp"
#include "ngraph/placement.hpp"
#include "ngraph/runtime/backend.hpp"
using namespace ngraph;
using namespace std;
runtime::hybrid::pass::FixGetOutputElement::FixGetOutputElement()
{
}
bool runtime::hybrid::pass::FixGetOutputElement::run_on_node(shared_ptr<Node> node)
{
if (node->description() == "GetOutputElement")
{
auto parent = node->get_arguments().at(0);
node->set_placement_index(parent->get_placement_index());
}
return false;
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <exception>
#include <functional>
#include <sstream>
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace runtime
{
namespace hybrid
{
namespace pass
{
class FixGetOutputElement;
}
}
}
}
class ngraph::runtime::hybrid::pass::FixGetOutputElement : public ngraph::pass::NodePass
{
public:
FixGetOutputElement();
private:
bool run_on_node(std::shared_ptr<Node> node) override;
};
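The pass simply copies a GetOutputElement node's placement from its argument, so a GOE never lands on a different device than the op that produced it. A sketch of wiring it in with the pass::Manager API already used in this diff; running it after AssignPlacement is an assumption:

// Sketch only: backend_list is assumed to be a std::vector<std::shared_ptr<runtime::Backend>> in priority order.
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<ngraph::runtime::hybrid::pass::AssignPlacement>(backend_list);
pass_manager.register_pass<ngraph::runtime::hybrid::pass::FixGetOutputElement>();
pass_manager.run_passes(func);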
......@@ -390,14 +390,15 @@ shared_ptr<runtime::Tensor>
runtime::intelgpu::IntelGPUBackend::create_tensor(const element::Type& element_type,
const Shape& shape)
{
return make_shared<runtime::intelgpu::IntelGPUTensorView>(element_type, shape, *ocl_engine);
return make_shared<runtime::intelgpu::IntelGPUTensorView>(
element_type, shape, *ocl_engine, nullptr, this);
}
shared_ptr<runtime::Tensor> runtime::intelgpu::IntelGPUBackend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
return make_shared<runtime::intelgpu::IntelGPUTensorView>(
element_type, shape, *ocl_engine, memory_pointer);
element_type, shape, *ocl_engine, memory_pointer, this);
}
runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
......
......@@ -28,8 +28,9 @@ using namespace std;
runtime::intelgpu::IntelGPUTensorView::IntelGPUTensorView(const element::Type& element_type,
const Shape& shape,
const cldnn::engine& backend_engine,
void* memory_pointer)
: runtime::Tensor(make_shared<descriptor::Tensor>(element_type, shape, "external"))
void* memory_pointer,
const runtime::Backend* parent)
: runtime::Tensor(make_shared<descriptor::Tensor>(element_type, shape, "external"), parent)
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(element_type, shape);
......
......@@ -38,7 +38,8 @@ public:
IntelGPUTensorView(const element::Type& element_type,
const Shape& shape,
const cldnn::engine& backend_engine,
void* memory_pointer = nullptr);
void* memory_pointer,
const runtime::Backend* parent);
/// \brief Write bytes directly into the tensor
/// \param p Pointer to source of data
......
......@@ -43,16 +43,25 @@ extern "C" runtime::Backend* new_backend(const char* configuration_string)
return new runtime::interpreter::INTBackend();
}
runtime::interpreter::INTBackend::INTBackend()
{
}
runtime::interpreter::INTBackend::INTBackend(const vector<string>& unsupported_op_name_list)
: m_unsupported_op_name_list{unsupported_op_name_list.begin(), unsupported_op_name_list.end()}
{
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTBackend::create_tensor(const element::Type& type, const Shape& shape)
{
return make_shared<runtime::HostTensor>(type, shape, "external");
return make_shared<runtime::HostTensor>(type, shape, this);
}
shared_ptr<runtime::Tensor> runtime::interpreter::INTBackend::create_tensor(
const element::Type& type, const Shape& shape, void* memory_pointer)
{
return make_shared<runtime::HostTensor>(type, shape, memory_pointer, "external");
return make_shared<runtime::HostTensor>(type, shape, memory_pointer, this);
}
runtime::Handle runtime::interpreter::INTBackend::compile(shared_ptr<Function> function)
......@@ -336,3 +345,8 @@ void runtime::interpreter::INTBackend::perform_nan_check(
arg_number++;
}
}
bool runtime::interpreter::INTBackend::is_supported(const Node& node) const
{
return m_unsupported_op_name_list.find(node.description()) == m_unsupported_op_name_list.end();
}
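The interpreter can now be told to disown specific ops, which is how a hybrid wrapper steers those ops to another device; get_excludes() in the GPUH backend above returns {"Not"} for exactly this purpose, although these hunks define it without showing where it is consumed. A minimal sketch of the new constructor:

// Sketch: an interpreter that reports "Not" as unsupported, so a placement pass must put it elsewhere.
auto fallback = std::make_shared<ngraph::runtime::interpreter::INTBackend>(
    std::vector<std::string>{"Not"});
// fallback->is_supported(node) now returns false for any node whose description() is "Not".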
......@@ -16,6 +16,7 @@
#pragma once
#include <initializer_list>
#include <memory>
#include <sstream>
#include <string>
......@@ -156,6 +157,12 @@ namespace ngraph
class ngraph::runtime::interpreter::INTBackend : public Backend
{
public:
INTBackend();
INTBackend(const std::vector<std::string>& unsupported_op_name_list);
INTBackend(const INTBackend&) = delete;
INTBackend(INTBackend&&) = delete;
INTBackend& operator=(const INTBackend&) = delete;
std::shared_ptr<Tensor>
create_tensor(const element::Type& type, const Shape& shape, void* memory_pointer) override;
......@@ -173,7 +180,8 @@ public:
std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override;
bool is_supported(const Node& node) const override { return true; }
bool is_supported(const Node& node) const override;
private:
int get_alignment() const { return 64; }
class FunctionInstance
......@@ -190,6 +198,7 @@ private:
void* get_temporary_pointer(size_t offset) { return m_temporary_memory->get_ptr(offset); }
};
std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map;
std::set<std::string> m_unsupported_op_name_list;
static void perform_nan_check(const std::vector<std::shared_ptr<HostTensor>>&,
const Node* op = nullptr);
......
......@@ -21,6 +21,7 @@
#include "ngraph/descriptor/layout/tensor_layout.hpp"
#include "ngraph/descriptor/tensor.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/type/element_type.hpp"
......@@ -37,9 +38,11 @@ namespace ngraph
class Tensor
{
protected:
Tensor(const std::shared_ptr<ngraph::descriptor::Tensor>& descriptor)
Tensor(const std::shared_ptr<ngraph::descriptor::Tensor>& descriptor,
const Backend* parent)
: m_descriptor(descriptor)
, m_stale(true)
, m_parent(parent)
{
}
......@@ -104,9 +107,11 @@ namespace ngraph
/// \param source The source tensor
virtual void copy_from(const ngraph::runtime::Tensor& source);
const Backend* get_parent() const { return m_parent; }
protected:
std::shared_ptr<ngraph::descriptor::Tensor> m_descriptor;
bool m_stale;
const Backend* m_parent;
};
using TensorViewPtrs = std::vector<std::shared_ptr<Tensor>>;
......
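Storing a non-owning parent pointer in the base Tensor lets the hybrid call path ask whether a tensor already belongs to the backend that needs it. A sketch of that check, assuming pointer equality is the intended test:

// Sketch only: copy between devices only when the tensor was created by a different backend.
bool needs_copy(const ngraph::runtime::Tensor& t, const ngraph::runtime::Backend* target)
{
    return t.get_parent() != target;
}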
......@@ -225,10 +225,6 @@ if (NGRAPH_INTERPRETER_ENABLE)
target_link_libraries(unit-test PRIVATE interpreter_backend)
endif()
if (NGRAPH_HYBRID_ENABLE)
target_link_libraries(unit-test PRIVATE hybrid_backend)
endif()
if (NGRAPH_GPU_ENABLE)
target_link_libraries(unit-test PRIVATE gpu_backend)
endif()
......