Unverified Commit 90503652 authored by Robert Kimball, committed by GitHub

Hybrid GPU Backend (#2240)

* Add GPUH hybrid backend

* update manifests

* update node operator<<

* fix GOE

* remove debug

* remove debug

* more cleanup

* add parent support to cpu and intel gpu backend tensors

* cleanup

* fix odd failure when printing node during construction

* fix node output

* address review comments

* style
parent 42f16035
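For context, a minimal usage sketch of the new hybrid backend API (illustrative only, not part of this diff): the HybridBackend constructor introduced here takes a priority-ordered list of backends, and compile()/call() split the function across them. The helper run_hybrid and the caller-supplied Function f are assumptions for the example; only constructors and methods that appear in this change are used.

#include <memory>
#include <vector>
#include "ngraph/runtime/hybrid/hybrid_backend.hpp"
#include "ngraph/runtime/interpreter/int_backend.hpp"

using namespace ngraph;

// Sketch: run a pre-built Function on the hybrid backend.
void run_hybrid(const std::shared_ptr<Function>& f)
{
    // Backend priority list: earlier entries are preferred at placement time.
    std::vector<std::shared_ptr<runtime::Backend>> backend_list{
        std::make_shared<runtime::interpreter::INTBackend>()};
    auto backend = std::make_shared<runtime::hybrid::HybridBackend>(backend_list);

    // Tensors are created on the first backend in the list.
    auto input = backend->create_tensor(element::f32, Shape{2, 2});
    auto output = backend->create_tensor(element::f32, Shape{2, 2});

    backend->compile(f);                  // splits f into per-placement sub-functions
    backend->call(f, {output}, {input});  // copies tensors between devices where parents differ
}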
......@@ -75,7 +75,6 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
include(var_functions)
set(NGRAPH_HYBRID_ENABLE TRUE)
option(NGRAPH_UNIT_TEST_ENABLE "Control the building of unit tests" TRUE)
option(NGRAPH_TOOLS_ENABLE "Control the building of tools" TRUE)
option(NGRAPH_CPU_ENABLE "Control the building of the CPU backend" TRUE)
......@@ -92,6 +91,10 @@ option(NGRAPH_CODE_COVERAGE_ENABLE "Enable code coverage data collection" FALSE)
option(NGRAPH_LIB_VERSIONING_ENABLE "Enable shared library versioning" FALSE)
option(NGRAPH_PYTHON_BUILD_ENABLE "Enable build nGraph python package wheel" FALSE)
if (NGRAPH_GPUH_ENABLE)
set(NGRAPH_GPU_ENABLE TRUE)
endif()
message(STATUS "NGRAPH_UNIT_TEST_ENABLE: ${NGRAPH_UNIT_TEST_ENABLE}")
message(STATUS "NGRAPH_TOOLS_ENABLE: ${NGRAPH_TOOLS_ENABLE}")
message(STATUS "NGRAPH_CPU_ENABLE: ${NGRAPH_CPU_ENABLE}")
......@@ -108,10 +111,6 @@ message(STATUS "NGRAPH_CODE_COVERAGE_ENABLE: ${NGRAPH_CODE_COVERAGE_ENABLE}")
message(STATUS "NGRAPH_LIB_VERSIONING_ENABLE: ${NGRAPH_LIB_VERSIONING_ENABLE}")
message(STATUS "NGRAPH_PYTHON_BUILD_ENABLE: ${NGRAPH_PYTHON_BUILD_ENABLE}")
if (NGRAPH_HYBRID_ENABLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNGRAPH_HYBRID_ENABLE")
endif()
if (NGRAPH_ONNX_IMPORT_ENABLE)
option(NGRAPH_USE_SYSTEM_PROTOBUF "Use system provided Protobuf shared object" FALSE)
option(NGRAPH_ONNXIFI_ENABLE "Enable ONNX Interface for Framework Integration" TRUE)
......
......@@ -238,7 +238,14 @@ std::ostream& Node::write_long_description(std::ostream& out) const
{
out << sep << NodeDescription(*arg, true) << ": "
<< pretty_element_type(arg->get_output_element_type(0))
<< arg->get_output_partial_shape(0) << "";
<< arg->get_output_partial_shape(0);
sep = ", ";
}
out << ") -> (";
sep = "";
for (const auto& o : get_outputs())
{
out << sep << pretty_element_type(o.get_element_type()) << o.get_partial_shape();
sep = ", ";
}
out << ")";
......
......@@ -15,10 +15,7 @@
# ******************************************************************************
add_subdirectory(interpreter)
if (NGRAPH_HYBRID_ENABLE)
add_subdirectory(hybrid)
endif()
add_subdirectory(hybrid)
if (NGRAPH_CPU_ENABLE)
add_subdirectory(cpu)
......
......@@ -58,13 +58,13 @@ shared_ptr<runtime::cpu::CPU_CallFrame> runtime::cpu::CPU_Backend::make_call_fra
shared_ptr<runtime::Tensor>
runtime::cpu::CPU_Backend::create_tensor(const element::Type& element_type, const Shape& shape)
{
return make_shared<runtime::cpu::CPUTensorView>(element_type, shape);
return make_shared<runtime::cpu::CPUTensorView>(element_type, shape, this);
}
shared_ptr<runtime::Tensor> runtime::cpu::CPU_Backend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
return make_shared<runtime::cpu::CPUTensorView>(element_type, shape, memory_pointer);
return make_shared<runtime::cpu::CPUTensorView>(element_type, shape, memory_pointer, this);
}
runtime::Handle runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func)
......
......@@ -35,8 +35,9 @@ using namespace std;
runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const string& name)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, name))
const runtime::Backend* parent)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, "external"),
parent)
, buffer(nullptr)
, aligned_buffer(nullptr)
{
......@@ -77,8 +78,8 @@ runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_
runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
const string& name)
: CPUTensorView(element_type, shape, nullptr, name)
const runtime::Backend* parent)
: CPUTensorView(element_type, shape, nullptr, parent)
{
}
......
......@@ -35,11 +35,11 @@ namespace ngraph
public:
CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
const std::string& name = "external");
const runtime::Backend* parent);
CPUTensorView(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const std::string& name = "external");
const runtime::Backend* parent);
virtual ~CPUTensorView() override;
char* get_data_ptr();
......
......@@ -159,11 +159,6 @@ if (NGRAPH_GPU_ENABLE)
${CUDA_LIBRARIES}
${CUDA_CUBLAS_LIBRARIES}
${CUDNN_LIBRARIES})
if (NGRAPH_HYBRID_ENABLE)
target_link_libraries(gpu_backend
PRIVATE
hybrid_backend)
endif()
set_target_properties(gpu_backend PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
......
......@@ -20,6 +20,7 @@
#include <cudnn.h>
#include "ngraph/graph_util.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
......@@ -107,13 +108,13 @@ runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
shared_ptr<runtime::Tensor>
runtime::gpu::GPU_Backend::create_tensor(const element::Type& element_type, const Shape& shape)
{
return make_shared<runtime::gpu::GPUTensor>(element_type, shape);
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, this);
}
shared_ptr<runtime::Tensor> runtime::gpu::GPU_Backend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, memory_pointer);
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, memory_pointer, this);
}
runtime::Handle runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func)
......@@ -222,33 +223,53 @@ vector<runtime::PerformanceCounter>
return rc;
}
bool runtime::gpu::GPU_Backend::is_supported(const Node& node) const
bool runtime::gpu::GPU_Backend::is_supported(const Node& op) const
{
bool rc = true;
set<string> unsupported_ops = {"Quantize",
"Dequantize",
"ShapeOf",
"All",
"Any",
"AllReduce",
"SelectAndScatter",
"StopGradient",
"EmbeddingLookup",
"GenerateMask"};
// get op type
element::Type type;
if (node.description() == "Select")
set<string> float_only = {"MaxPoolBackprop", "AvgPoolBackprop", "MaxPool", "Dot"};
if (unsupported_ops.find(op.description()) != unsupported_ops.end())
{
type = node.get_input_element_type(1);
return false;
}
else if (node.description() == "Constant")
if (float_only.find(op.description()) != float_only.end())
{
type = node.get_outputs().at(0).get_element_type();
}
else if (node.description() == "Parameter")
if (op.get_output_element_type(0) != element::f32 &&
op.get_output_element_type(0) != element::f64)
{
type = node.get_outputs().at(0).get_element_type();
return false;
}
else
{
type = node.get_input_element_type(0);
}
if (type != element::f32)
if (op.description() == "BatchNormInference")
{
const ngraph::op::BatchNormInference* bn =
static_cast<const ngraph::op::BatchNormInference*>(&op);
if (bn->get_eps_value() < CUDNN_BN_MIN_EPSILON)
{
rc = false;
return false;
}
}
else if (op.description() == "BatchNormTraining")
{
const ngraph::op::BatchNormTraining* bn =
static_cast<const ngraph::op::BatchNormTraining*>(&op);
if (bn->get_eps_value() < CUDNN_BN_MIN_EPSILON)
{
return false;
}
}
return rc;
return true;
}
......@@ -29,8 +29,10 @@ using namespace std;
runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, "external"))
void* memory_pointer,
const Backend* backend)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, "external"),
backend)
, m_custom_memory(false)
{
m_descriptor->set_tensor_layout(
......@@ -48,8 +50,10 @@ runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
}
}
runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type, const Shape& shape)
: GPUTensor(element_type, shape, nullptr)
runtime::gpu::GPUTensor::GPUTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const Backend* backend)
: GPUTensor(element_type, shape, nullptr, backend)
{
}
......
......@@ -16,9 +16,9 @@
#pragma once
#include <cuda.h>
#include <memory>
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/type/element_type.hpp"
......@@ -36,8 +36,11 @@ namespace ngraph
class ngraph::runtime::gpu::GPUTensor : public ngraph::runtime::Tensor
{
public:
GPUTensor(const ngraph::element::Type& element_type, const Shape& shape);
GPUTensor(const ngraph::element::Type& element_type, const Shape& shape, void* memory_pointer);
GPUTensor(const ngraph::element::Type& element_type, const Shape& shape, const Backend* parent);
GPUTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const Backend* parent);
virtual ~GPUTensor() override;
/// \brief Write bytes directly into the tensor
......
#int64 is not supported by cuDNN
batch_norm_one_output
batch_norm_three_outputs
backwards_batch_norm_three_outputs
#need to check
# need to check
computation_reuse
#cuda does not support throw
# cuda does not support throw
divide_by_zero_int32
#int64 is not supported by cuDNN
# int64 is not supported by cuDNN
dot_matrix_vector_int64
generate_mask
#error throw is not the same on GPU, not supported yet
one_hot_scalar_fp_nonint_in_3
one_hot_scalar_oob_in_3
one_hot_vector_1_barely_oob
one_hot_vector_1_far_oob
one_hot_vector_1_fp_nonint
#select_and_scatter is deprecated
# select_and_scatter is deprecated
select_and_scatter_3d_without_overlap
select_and_scatter_with_overlap
select_and_scatter_without_overlap
#custom_mem is not implemented on GPU
# custom_mem is not implemented on GPU
tensorview_custom_mem
#integer is not supported by cuDNN on backward pooling
# integer is not supported by cuDNN on backward pooling
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_avgpool_n1_c1_hw2x2
......
......@@ -21,7 +21,7 @@ if (NGRAPH_GPUH_ENABLE)
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
target_link_libraries(gpuh_backend PUBLIC ngraph)
target_link_libraries(gpuh_backend PUBLIC ngraph hybrid_base gpu_backend)
set_target_properties(gpuh_backend PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
install(TARGETS gpuh_backend
......
......@@ -18,6 +18,7 @@
#include "ngraph/graph_util.hpp"
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/interpreter/int_backend.hpp"
#include "ngraph/runtime/tensor.hpp"
......@@ -34,7 +35,13 @@ extern "C" runtime::Backend* new_backend(const char* configuration_string)
return new runtime::gpuh::GPUHBackend();
}
vector<string> get_excludes()
{
return vector<string>{{"Not"}};
}
runtime::gpuh::GPUHBackend::GPUHBackend()
: HybridBackend({{"INTERPRETER", make_shared<ngraph::runtime::interpreter::INTBackend>()}})
: HybridBackend({make_shared<ngraph::runtime::gpu::GPU_Backend>(),
make_shared<ngraph::runtime::interpreter::INTBackend>()})
{
}
computation_reuse
tensorview_custom_mem
batch_norm_inference_f64
batch_norm_inference_f32
divide_by_zero_int32
......@@ -26,8 +26,10 @@ using namespace std;
runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const string& name)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, name))
const string& name,
const Backend* parent)
: runtime::Tensor(std::make_shared<ngraph::descriptor::Tensor>(element_type, shape, name),
parent)
, m_allocated_buffer_pool(nullptr)
, m_aligned_buffer_pool(nullptr)
......@@ -56,8 +58,24 @@ runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const string& name)
: HostTensor(element_type, shape, nullptr, name)
const string& name,
const Backend* parent)
: HostTensor(element_type, shape, nullptr, name, parent)
{
}
runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const Backend* parent)
: HostTensor(element_type, shape, nullptr, "external", parent)
{
}
runtime::HostTensor::HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const Backend* parent)
: HostTensor(element_type, shape, memory_pointer, "external", parent)
{
}
......
......@@ -18,6 +18,7 @@
#include <memory>
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/type/element_type.hpp"
......@@ -36,11 +37,20 @@ class ngraph::runtime::HostTensor : public ngraph::runtime::Tensor
public:
HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const std::string& name = "external");
const std::string& name = "external",
const Backend* parent = nullptr);
HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const std::string& name = "external");
const std::string& name = "external",
const Backend* parent = nullptr);
HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
const Backend* parent);
HostTensor(const ngraph::element::Type& element_type,
const Shape& shape,
void* memory_pointer,
const Backend* parent);
virtual ~HostTensor() override;
char* get_data_ptr();
......
......@@ -14,21 +14,14 @@
# limitations under the License.
# ******************************************************************************
if (NGRAPH_HYBRID_ENABLE)
add_library(hybrid_backend SHARED
add_library(hybrid_base STATIC
hybrid_backend.cpp
hybrid_util.cpp
pass/assign_placement.cpp)
if(NGRAPH_LIB_VERSIONING_ENABLE)
set_target_properties(hybrid_backend PROPERTIES
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
target_link_libraries(hybrid_backend PUBLIC ngraph)
set_target_properties(hybrid_backend PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
pass/assign_placement.cpp
pass/fix_get_output_element.cpp)
target_link_libraries(hybrid_base PUBLIC ngraph)
set_target_properties(hybrid_base PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
install(TARGETS hybrid_backend
LIBRARY DESTINATION "${NGRAPH_INSTALL_LIB}"
install(TARGETS hybrid_base
ARCHIVE DESTINATION "${NGRAPH_INSTALL_LIB}"
)
endif()
)
......@@ -17,36 +17,21 @@
#include "ngraph/runtime/hybrid/hybrid_backend.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/hybrid/hybrid_util.hpp"
#include "ngraph/runtime/hybrid/pass/assign_placement.hpp"
#include "ngraph/runtime/hybrid/pass/fix_get_output_element.hpp"
#include "ngraph/runtime/interpreter/int_backend.hpp"
#include "ngraph/runtime/tensor.hpp"
using namespace ngraph;
using namespace std;
template <typename T>
void copy_data(std::shared_ptr<ngraph::runtime::Tensor> tv, const std::vector<T>& data)
{
size_t data_size = data.size() * sizeof(T);
tv->write(data.data(), 0, data_size);
}
template <typename T>
std::vector<T> read_vector(std::shared_ptr<ngraph::runtime::Tensor> tv)
{
if (ngraph::element::from<T>() != tv->get_tensor_layout()->get_element_type())
{
throw std::invalid_argument("read_vector type must match Tensor type");
}
size_t element_count = ngraph::shape_size(tv->get_shape());
size_t size = element_count * sizeof(T);
std::vector<T> rc(element_count);
tv->read(rc.data(), 0, size);
return rc;
}
runtime::hybrid::HybridBackend::HybridBackend(
const std::vector<std::pair<std::string, std::shared_ptr<runtime::Backend>>>& backend_list)
const std::vector<std::shared_ptr<runtime::Backend>>& backend_list)
: m_backend_list{backend_list}
{
}
......@@ -56,46 +41,44 @@ shared_ptr<runtime::Tensor>
const Shape& shape)
{
auto it = m_backend_list.begin();
return it->second->create_tensor(element_type, shape);
return (*it)->create_tensor(element_type, shape);
}
shared_ptr<runtime::Tensor> runtime::hybrid::HybridBackend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
auto it = m_backend_list.begin();
return it->second->create_tensor(element_type, shape, memory_pointer);
return (*it)->create_tensor(element_type, shape, memory_pointer);
}
runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> func)
{
if (m_function_map.find(func) == m_function_map.end())
{
vector<shared_ptr<runtime::Backend>> backend_list;
for (auto p : m_backend_list)
{
backend_list.push_back(p.second);
}
// Clone function
FunctionInstance instance;
instance.m_function = clone_function(*func);
// Run placement pass
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<runtime::hybrid::pass::AssignPlacement>(backend_list);
pass_manager.register_pass<runtime::hybrid::pass::AssignPlacement>(m_backend_list);
pass_manager.register_pass<runtime::hybrid::pass::FixGetOutputElement>();
#ifdef GPUH_DEBUG
pass_manager.register_pass<ngraph::pass::VisualizeTree>("graph.png");
#endif
pass_manager.run_passes(instance.m_function);
// Split function to sub_functions
tie(instance.m_sub_functions, instance.m_map_parameter_to_result) =
split_function_by_placement_size(instance.m_function);
runtime::hybrid::split_function_by_placement(instance.m_function);
m_function_map.insert({func, instance});
// Compile subfunctions in corresponding backends
for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
{
size_t placement = get_colocated_function_placement_size(sub_function);
size_t placement = runtime::hybrid::get_colocated_function_placement(sub_function);
auto backend = m_backend_list[placement];
backend.second->compile(sub_function);
backend->compile(sub_function);
// Compile will replace nodes so we need to make one more pass through all
// ops to reset placement
......@@ -116,70 +99,103 @@ bool runtime::hybrid::HybridBackend::call(shared_ptr<Function> func,
// Get FunctionInstance
bool rc = true;
auto it = m_function_map.find(func);
if (it == m_function_map.end())
using node_map_t = unordered_map<shared_ptr<Node>, shared_ptr<runtime::Tensor>>;
auto fit = m_function_map.find(func);
if (fit == m_function_map.end())
{
throw runtime_error("compile() must be called before call().");
}
FunctionInstance& instance = it->second;
FunctionInstance& instance = fit->second;
// Each parameter and result node in sub_function maps to one Tensor
unordered_map<shared_ptr<Node>, shared_ptr<runtime::Tensor>> map_node_to_tensor_view;
node_map_t map_node_to_tensor;
for (size_t i = 0; i < inputs.size(); ++i)
{
map_node_to_tensor_view[instance.m_function->get_parameters()[i]] = inputs[i];
map_node_to_tensor[instance.m_function->get_parameters()[i]] = inputs[i];
}
for (size_t i = 0; i < outputs.size(); ++i)
{
map_node_to_tensor_view[instance.m_function->get_results()[i]] = outputs[i];
map_node_to_tensor[instance.m_function->get_results()[i]] = outputs[i];
}
// Call subfunctions
for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
for (const shared_ptr<Function>& sub_function : instance.m_sub_functions)
{
// Init backend
size_t placement = get_colocated_function_placement_size(sub_function);
auto backend = m_backend_list[placement].second;
size_t placement = runtime::hybrid::get_colocated_function_placement(sub_function);
auto backend = m_backend_list[placement];
// Prepare parameter TensorViews
vector<shared_ptr<runtime::Tensor>> parameter_tvs;
for (auto parameter_node : sub_function->get_parameters())
// Prepare parameter Tensors
vector<shared_ptr<runtime::Tensor>> parameters;
for (const shared_ptr<op::Parameter>& parameter_node : sub_function->get_parameters())
{
if (map_node_to_tensor_view.find(parameter_node) != map_node_to_tensor_view.end())
auto it = map_node_to_tensor.find(parameter_node);
if (it != map_node_to_tensor.end())
{
parameter_tvs.push_back(map_node_to_tensor_view.at(parameter_node));
if (it->second->get_parent() == backend.get())
{
parameters.push_back(it->second);
}
else
{
auto parameter = backend->create_tensor(parameter_node->get_element_type(),
parameter_node->get_shape());
parameter->copy_from(*(it->second));
parameters.push_back(parameter);
}
}
else
{
// Handle temporary tensors that go between subgraphs
auto result_node = instance.m_map_parameter_to_result.at(parameter_node);
auto result_tv = map_node_to_tensor_view.at(result_node);
auto parameter_tv = backend->create_tensor(parameter_node->get_element_type(),
auto result = map_node_to_tensor.at(result_node);
auto parameter = backend->create_tensor(parameter_node->get_element_type(),
parameter_node->get_shape());
copy_data(parameter_tv, read_vector<float>(result_tv));
map_node_to_tensor_view[parameter_node] = parameter_tv;
parameter_tvs.push_back(parameter_tv);
parameter->copy_from(*result);
map_node_to_tensor[parameter_node] = parameter;
parameters.push_back(parameter);
}
}
// Prepare result TensorViews
vector<shared_ptr<runtime::Tensor>> result_tvs;
for (auto result_node : sub_function->get_results())
// Prepare result Tensors
vector<shared_ptr<runtime::Tensor>> results;
map<runtime::Tensor*, runtime::Tensor*> copy_back;
for (const shared_ptr<op::Result>& result_node : sub_function->get_results())
{
auto it = map_node_to_tensor.find(result_node);
if (it != map_node_to_tensor.end())
{
if (map_node_to_tensor_view.find(result_node) != map_node_to_tensor_view.end())
if (it->second->get_parent() == backend.get())
{
result_tvs.push_back(map_node_to_tensor_view.at(result_node));
results.push_back(it->second);
}
else
{
auto result_tv = backend->create_tensor(result_node->get_element_type(),
auto result = backend->create_tensor(result_node->get_element_type(),
result_node->get_shape());
map_node_to_tensor_view[result_node] = result_tv;
result_tvs.push_back(result_tv);
results.push_back(result);
copy_back.insert({result.get(), it->second.get()});
}
}
else
{
// Handle temporary tensors that go between subgraphs
auto result = backend->create_tensor(result_node->get_element_type(),
result_node->get_shape());
map_node_to_tensor[result_node] = result;
results.push_back(result);
}
}
// Call
backend->call_with_validate(sub_function, result_tvs, parameter_tvs);
backend->call(sub_function, results, parameters);
// Need to copy any results to the correct device
for (const auto& p : copy_back)
{
p.second->copy_from(*p.first);
}
}
return rc;
}
......@@ -188,3 +204,43 @@ bool runtime::hybrid::HybridBackend::is_supported(const Node& node) const
{
return true;
}
string runtime::hybrid::HybridBackend::get_placement_name(const runtime::Tensor* t)
{
string rc;
if (dynamic_cast<const runtime::HostTensor*>(t) != nullptr)
{
rc = "HostTensor";
}
else if (dynamic_cast<const runtime::gpu::GPUTensor*>(t) != nullptr)
{
rc = "GPUTensor";
}
return rc;
}
string runtime::hybrid::HybridBackend::get_placement_name(const runtime::Backend* t)
{
string rc;
if (dynamic_cast<const runtime::interpreter::INTBackend*>(t) != nullptr)
{
rc = "INTBackend";
}
else if (dynamic_cast<const runtime::gpu::GPU_Backend*>(t) != nullptr)
{
rc = "GPU_Backend";
}
return rc;
}
size_t runtime::hybrid::HybridBackend::get_placement(const runtime::Tensor* t)
{
size_t index = 0;
for (const shared_ptr<ngraph::runtime::Backend>& be : m_backend_list)
{
if (t->get_parent() == be.get())
{
return index;
}
index++;
}
return -1;
}
......@@ -37,8 +37,7 @@ namespace ngraph
class ngraph::runtime::hybrid::HybridBackend : public ngraph::runtime::Backend
{
public:
HybridBackend(
const std::vector<std::pair<std::string, std::shared_ptr<runtime::Backend>>>& backend_list);
HybridBackend(const std::vector<std::shared_ptr<runtime::Backend>>& backend_list);
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
......@@ -69,5 +68,9 @@ private:
};
std::map<std::shared_ptr<ngraph::Function>, FunctionInstance> m_function_map;
std::vector<std::pair<std::string, std::shared_ptr<runtime::Backend>>> m_backend_list;
std::vector<std::shared_ptr<runtime::Backend>> m_backend_list;
std::string get_placement_name(const runtime::Tensor* t);
std::string get_placement_name(const runtime::Backend* t);
size_t get_placement(const runtime::Tensor* t);
};
......@@ -15,11 +15,13 @@
//*****************************************************************************
#include "ngraph/runtime/hybrid/hybrid_util.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/visualize_tree.hpp"
using namespace ngraph;
using namespace std;
static Node* take_independent_node_with_placement_priority_size(
static Node* take_independent_node_with_placement_priority(
map<size_t, deque<Node*>>& independent_nodes_by_placement, size_t placement)
{
Node* selected_node = nullptr;
......@@ -45,7 +47,7 @@ static Node* take_independent_node_with_placement_priority_size(
}
static vector<unordered_set<shared_ptr<Node>>>
group_function_nodes_to_clusters_size(const shared_ptr<Function>& f)
group_function_nodes_to_clusters(const shared_ptr<Function>& f)
{
// Topologically sort nodes by picking independent node with the same placement as the
// previously picked node greedily
......@@ -66,7 +68,7 @@ static vector<unordered_set<shared_ptr<Node>>>
list<shared_ptr<Node>> sorted_nodes;
size_t previous_placement = 0;
while (Node* independent_node = take_independent_node_with_placement_priority_size(
while (Node* independent_node = ::take_independent_node_with_placement_priority(
independent_nodes_by_placement, previous_placement))
{
previous_placement = independent_node->get_placement_index();
......@@ -148,42 +150,48 @@ static vector<unordered_set<shared_ptr<Node>>>
// | <------[3]------+ | | | <------[7]------+ | | <------[11]-----+ |
// +-----+ +-----+ | +-----+ +-----+ +-----+ +-----+
// Suffix *_size as a part of function name is temporary, this suffix
// will be removed when the backends move to the latest Hybrid backend
pair<shared_ptr<op::Result>, shared_ptr<op::Parameter>>
insert_result_parameter_split_size(const shared_ptr<Node>& src_node,
static map<shared_ptr<op::Result>, shared_ptr<op::Parameter>>
insert_result_parameter_split(const shared_ptr<Node>& src_node,
const shared_ptr<Node>& dst_node)
{
if (src_node->get_output_size() != 1)
map<shared_ptr<op::Result>, shared_ptr<op::Parameter>> result_map;
for (descriptor::Input& input : dst_node->get_inputs())
{
throw ngraph_error("Multiple output per op not supported in graph partition yet.");
}
if (input.get_output().get_node() == src_node)
{
descriptor::Input* dst_input = &input;
descriptor::Output* src_output = &input.get_output();
// Make parameter node
shared_ptr<op::Parameter> par_node = make_shared<op::Parameter>(
src_node->get_output_element_type(0), src_node->get_output_shape(0));
shared_ptr<op::Parameter> par_node =
make_shared<op::Parameter>(src_output->get_element_type(), src_output->get_shape());
par_node->set_placement_index(dst_node->get_placement_index());
// Fix input / output among src, dst and par
descriptor::Input* dst_input = dst_node->get_input_from(src_node);
descriptor::Output* src_output = src_node->get_output_to(dst_node);
src_output->remove_input(dst_input); // Remove [0]
dst_input->replace_output(par_node, 0); // Remove [0] (again), add [8], remove [1], add [9]
// Remove [0]
src_output->remove_input(dst_input);
// Remove [0] (again), add [8], remove [1], add [9]
dst_input->replace_output(par_node, 0);
// Add res node
shared_ptr<op::Result> res_node = make_shared<op::Result>(src_node); // Add [4], [5], [6], [7]
shared_ptr<op::Result> res_node =
make_shared<op::Result>(src_node); // Add [4], [5], [6], [7]
res_node->set_placement_index(src_node->get_placement_index());
return make_pair(res_node, par_node);
result_map.insert({res_node, par_node});
}
}
return result_map;
}
// Suffix *_size as a part of function name is temporary, this suffix
// will be removed when the backends move to the latest Hybrid backend
pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>>>
runtime::hybrid::split_function_by_placement_size(const shared_ptr<Function>& f)
runtime::hybrid::split_function_by_placement(const shared_ptr<Function>& f)
{
// Split functions to clusters of nodes that can be computed together
vector<unordered_set<shared_ptr<Node>>> clusters = group_function_nodes_to_clusters_size(f);
vector<unordered_set<shared_ptr<Node>>> clusters = ::group_function_nodes_to_clusters(f);
// Map from (intermediate) parameter to result node, for guiding data copy among devices
unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>> map_parameter_to_result;
......@@ -208,8 +216,10 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
if (src_cluster != dst_cluster)
{
// Split src_node and dst_node
pair<shared_ptr<op::Result>, shared_ptr<op::Parameter>> res_par_pair =
insert_result_parameter_split_size(src_node, dst_node);
map<shared_ptr<op::Result>, shared_ptr<op::Parameter>> res_par_pair_map =
::insert_result_parameter_split(src_node, dst_node);
for (const auto& res_par_pair : res_par_pair_map)
{
shared_ptr<op::Result> res_node = res_par_pair.first;
shared_ptr<op::Parameter> par_node = res_par_pair.second;
map_parameter_to_result[par_node] = res_node;
......@@ -220,6 +230,7 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
}
}
}
}
// Create functions from clusters
vector<shared_ptr<Function>> sub_functions;
......@@ -240,15 +251,19 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
}
auto sub_function = make_shared<Function>(res_vector, par_vector);
sub_functions.push_back(sub_function);
#ifdef HYBRID_DEBUG
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<ngraph::pass::VisualizeTree>("subgraph_" + to_string(index++) +
".png");
pass_manager.run_passes(sub_function);
#endif
}
return make_pair(sub_functions, map_parameter_to_result);
}
// Suffix *_size as a part of function name is temporary, this suffix
// will be removed when the backends move to the latest Hybrid backend
// Assert that all nodes in the function are colocated and return that placement
size_t runtime::hybrid::get_colocated_function_placement_size(shared_ptr<Function> func)
size_t runtime::hybrid::get_colocated_function_placement(shared_ptr<Function> func)
{
auto ops = func->get_ops();
......@@ -259,7 +274,7 @@ size_t runtime::hybrid::get_colocated_function_placement_size(shared_ptr<Functio
size_t node_placement = op->get_placement_index();
if (node_placement == Node::placement_invalid)
{
throw ngraph_error("Node should have a device placement");
throw ngraph_error("Node " + op->get_name() + " should have a device placement");
}
if (function_placement != node_placement)
{
......
......@@ -34,10 +34,10 @@ namespace ngraph
std::pair<
std::vector<std::shared_ptr<Function>>,
std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<op::Result>>>
split_function_by_placement_size(const std::shared_ptr<Function>& f);
split_function_by_placement(const std::shared_ptr<Function>& f);
// Assert that all nodes in the function are colocated and return that placement
size_t get_colocated_function_placement_size(std::shared_ptr<Function> func);
size_t get_colocated_function_placement(std::shared_ptr<Function> func);
}
}
}
......@@ -24,7 +24,7 @@ using namespace ngraph;
using namespace std;
runtime::hybrid::pass::AssignPlacement::AssignPlacement(
vector<shared_ptr<runtime::Backend>> placement_backends)
const vector<shared_ptr<runtime::Backend>>& placement_backends)
: m_placement_backends(placement_backends)
{
}
......
......@@ -39,8 +39,8 @@ namespace ngraph
class ngraph::runtime::hybrid::pass::AssignPlacement : public ngraph::pass::NodePass
{
public:
// TODO: make policy a class
AssignPlacement(std::vector<std::shared_ptr<ngraph::runtime::Backend>> placement_backends);
AssignPlacement(
const std::vector<std::shared_ptr<ngraph::runtime::Backend>>& placement_backends);
private:
bool run_on_node(std::shared_ptr<Node> node) override;
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/hybrid/pass/fix_get_output_element.hpp"
#include "ngraph/log.hpp"
#include "ngraph/node.hpp"
#include "ngraph/placement.hpp"
#include "ngraph/runtime/backend.hpp"
using namespace ngraph;
using namespace std;
runtime::hybrid::pass::FixGetOutputElement::FixGetOutputElement()
{
}
bool runtime::hybrid::pass::FixGetOutputElement::run_on_node(shared_ptr<Node> node)
{
if (node->description() == "GetOutputElement")
{
auto parent = node->get_arguments().at(0);
node->set_placement_index(parent->get_placement_index());
}
return false;
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <exception>
#include <functional>
#include <sstream>
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace runtime
{
namespace hybrid
{
namespace pass
{
class FixGetOutputElement;
}
}
}
}
class ngraph::runtime::hybrid::pass::FixGetOutputElement : public ngraph::pass::NodePass
{
public:
FixGetOutputElement();
private:
bool run_on_node(std::shared_ptr<Node> node) override;
};
abc
abc_int64
abs
acos
add
add_overload
aliased_output
argmax_3D_axis_0
argmax_3D_axis_1
argmax_3D_axis_2
argmax_4D_axis_3
argmax_trivial
argmin_4D_axis_3
argmin_trivial
asin
atan
avg_pool_1d_1channel_1image
avg_pool_1d_1channel_2image
avg_pool_1d_2channel_2image
avg_pool_2d_1channel_1image_padded_do_not_include_in_computation
avg_pool_2d_1channel_1image_padded_include_in_computation
avg_pool_2d_1channel_1image_strided
avg_pool_2d_2channel_2image
avg_pool_2d_2channel_2image_3x3_padded_do_not_include_in_computation
avg_pool_2d_2channel_2image_3x3_padded_include_in_computation
avg_pool_2d_2channel_2image_3x3_strided_padded_do_not_include_in_computation
avg_pool_2d_2channel_2image_3x3_strided_padded_include_in_computation
avg_pool_2d_2channel_2image_3x3_strided_uneven_padded_do_not_include_in_computation
avg_pool_2d_2channel_2image_3x3_strided_uneven_padded_include_in_computation
avg_pool_2d_2channel_2image_padded_do_not_include_in_computation
avg_pool_2d_2channel_2image_padded_include_in_computation
avg_pool_2d_2channel_2image_padded_only_above_do_not_include_in_computation
avg_pool_2d_2channel_2image_padded_only_above_include_in_computation
avg_pool_2d_2channel_2image_padded_only_below_do_not_include_in_computation
avg_pool_2d_2channel_2image_padded_only_below_include_in_computation
avg_pool_3d_strided_uneven_padded_do_not_include_in_computation
avg_pool_3d_uneven_strided_padded_include_in_computation
backwards_abc
backwards_abs
backwards_acos
backwards_add
backwards_add_nested
backwards_asin
backwards_atan
backwards_avgpool_n1_c1_hw2x2
backwards_avgpool_n1_c1_hw4x4
backwards_avgpool_n2_c2_hw2x2_win_2x2_str_1x1_padding_numeric
backwards_avgpool_n2_c2_hw4x4
backwards_avgpool_n2_c2_hw4x4_numeric
backwards_avgpool_n2_c2_hw4x4_win_2x2_str_1x1_numeric
backwards_batch_norm_three_outputs
backwards_broadcast0
backwards_broadcast1
backwards_ceiling
backwards_concat_axis_0
backwards_concat_axis_1
backwards_concat_vector
backwards_cos
backwards_cosh
backwards_divide
backwards_dot_scalar_scalar
backwards_dot_scalar_tensor
backwards_dot_tensor_scalar
backwards_dot_tensor_vector
backwards_dot_tensor2_tensor2
backwards_dot_tensor3_tensor3
backwards_dot_vector_vector
backwards_exp
backwards_floor
backwards_log
backwards_maximum
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_maxpool_n2_c1_hw5_3x3_str2_max_pad1x2_2x3
backwards_maxpool_n2c1h5w5_kh3kw3_sh2sw2
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1
backwards_minimum
backwards_multiply
backwards_negative
backwards_parameter
backwards_power
backwards_relu
backwards_replace_slice
backwards_reshape
backwards_reverse_3d_02
backwards_reverse_sequence_n3_c2_h3
backwards_reverse_sequence_n4d2c3h2w2
backwards_select
backwards_select_nested
backwards_sigmoid
backwards_sign
backwards_sin
backwards_sinh
backwards_slice
backwards_softmax_3d
backwards_softmax_all
backwards_softmax_axis
backwards_softmax_underflow
backwards_subtract
backwards_sum_m2s
backwards_sum_m2v_0
backwards_sum_m2v_1
backwards_sum_v2s
backwards_tan
backwards_tanh
batch_norm_one_output
batch_norm_three_outputs
batchnorm_bprop_n4c3h2w2
batchnorm_fprop_b1c2h2w2
batchnorm_fprop_b2c2h2w1
batchnorm_fprop_globalstats_b2c2w2h1
batchnorm_fprop_inference_b2c2h2w1
broadcast_algo_3d_backward
broadcast_algo_3d_stride_1
broadcast_algo_3d_stride_2
broadcast_algo_matrix_backward_4
broadcast_algo_matrix_stride_1
broadcast_algo_matrix_stride_2
broadcast_algo_matrix_stride_3
broadcast_algo_scalar
broadcast_algo_vector_backward_2
broadcast_algo_vector_backward_3
broadcast_algo_vector_backward_4
broadcast_algo_vector_forward_2
broadcast_algo_vector_forward_3
broadcast_algo_vector_forward_4
broadcast_algo_vector_middle
broadcast_matrix_0
broadcast_matrix_1
broadcast_matrix_2
broadcast_scalar_matrix
broadcast_scalar_tensor
broadcast_scalar_to_matrix_int32
broadcast_scalar_to_matrix_int64
broadcast_scalar_vector
broadcast_to_non_existent_axis
broadcast_trivial
broadcast_vector_colwise
broadcast_vector_rowwise
broadcast_vector_rowwise_int64
broadcast_vector_rowwise_reversed
ceiling
computation_reuse
concat_2d_tensor
concat_4d_tensor
concat_5d
concat_matrix_colwise
concat_matrix_int64
concat_matrix_rowwise
concat_vector
concat_zero_length_1d_last
concat_zero_length_1d_middle
concat_zero_length_4d_middle
constant_broadcast
constant_equality_bool
constant_multi_use
convert_float32_bool
convert_int32_bool
convert_int32_float32
convert_uint16_float32
convolution_2d_1item
convolution_2d_1item_1o1i_data_dilated
convolution_2d_1item_2o1i_data_dilated
convolution_2d_1item_2o2i_data_dilated
convolution_2d_1item_5o3i_data_dilated
convolution_2d_1item_padded_1_1x1_1
convolution_2d_1item_padded_2_3x4_5
convolution_2d_2item_5o3i_data_dilated
convolution_2d_2items
convolution_2d_2items_dilated
convolution_2d_2items_dilated_padded
convolution_2d_2items_strided
convolution_2d_2items_strided_padded
convolution_2d_2items_strided_padded_same
convolution_2d_8item_large_5o3i_data_dilated
convolution_2d_8item_large_5o3i_uneven_filter_data_dilated
convolution_2d_8item_large_5o3i_uneven_filter_uneven_data_dilation_data_dilated
convolution_3d_1item_large_5o3i_padded_uneven_filter_uneven_data_dilation_data_dilated
convolution_3d_2item_large_5o3i_padded_strided_uneven_filter_uneven_data_dilation_data_dilated
convolution_3d_2item_large_5o3i_padded_strided_uneven_filter_uneven_data_dilation_filter_dilated_data_dilated
convolution_3d_2item_large_5o3i_uneven_filter_uneven_data_dilation_data_dilated
convolution_3d_2items
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
convolution_4d_4items_padded_neg
convolution_4d_4items_strided
convolution_4d_4items_strided_dilated
convolution_4d_4items_strided_dilated_padded
convolution_4d_4items_strided_dilated_padded_neg
convolution_4d_4items_strided_dilated_padded_same
convolution_outlining
cos
cosh
dequantize
dequantize_axes
dequantize_int8
divide
divide_adjoint_stability
divide_by_zero_float32
divide_by_zero_int32
divide_overload
dot_0_0
dot_2x0_0
dot_3d_multi_axis
dot_3d_one_axis_arbitrary
dot_4d_5d_multi_axis
dot_4d_5d_multi_axis_more
dot_matrix_0x2_2x0
dot_matrix_2x0_0x2
dot_matrix_3x2_2x0
dot_matrix_vector
dot_matrix_vector_4_3
dot_matrix_vector_int64
dot_scalar_0x2
dot_scalar_scalar
dot_scalar_tensor_arg0
dot_scalar_tensor_arg1
dot1d
dot2d
dot3d_2d
dot3d_3d
equal
exp
floor
function_call
function_name
fuse_max_with_constant_zero_input_as_relu
greater
greatereq
generate_mask
kahan_sum_3d_to_vector
kahan_sum_to_scalar
less
lesseq
lesseq_bool
log
logical_and
logical_or
lrn
max_3d_eliminate_zero_dim
max_3d_to_matrix_least_sig
max_3d_to_matrix_most_sig
max_3d_to_scalar
max_3d_to_vector
max_matrix_cols_zero
max_matrix_columns
max_matrix_rows
max_matrix_rows_zero
max_matrix_to_scalar_zero_by_zero
max_pool_1d_1channel_1image
max_pool_1d_1channel_2image
max_pool_1d_2channel_2image
max_pool_2d_1channel_1image_overpadded
max_pool_2d_1channel_1image_padded
max_pool_2d_1channel_1image_padded_negative_values
max_pool_2d_1channel_1image_strided
max_pool_2d_2channel_2image
max_pool_2d_2channel_2image_asym_pad
max_pool_3d
max_to_scalar
max_trivial
max_trivial_5d
max_vector_zero
maximum
maximum_int32
maximum_int64
min_3d_eliminate_zero_dim
min_3d_to_matrix_least_sig
min_3d_to_matrix_most_sig
min_3d_to_scalar
min_3d_to_vector
min_matrix_cols_zero
min_matrix_columns
min_matrix_rows
min_matrix_rows_zero
min_matrix_to_scalar_zero_by_zero
min_to_scalar
min_trivial
min_trivial_5d
min_vector_zero
minimum
minimum_int32
minimum_int64
multiple_backends
multiple_result
multiply
multiply_overload
negative
node_name
not
notequal
numeric_double_inf
numeric_double_nan
numeric_float_inf
numeric_float_nan
one_hot_matrix_0
one_hot_scalar_0_in_3
one_hot_scalar_1_in_3
one_hot_scalar_2_in_3
one_hot_scalar_fp_nonint_in_3
one_hot_scalar_oob_in_3
one_hot_vector_0
one_hot_vector_1
one_hot_vector_1_barely_oob
one_hot_vector_1_far_oob
one_hot_vector_1_fp
one_hot_vector_1_fp_nonint
pad_2channel_2image_asym
pad_exterior_1d
pad_exterior_2d_0x0
pad_exterior_2d_0x3
pad_exterior_2d_3x0
pad_exterior_4d_1x2x2x2
pad_interior_1d
pad_interior_exterior_1d
pad_interior_exterior_2d
pad_interior_exterior_4d_2x0x3x2
parameter_as_output
power
product_3d_eliminate_zero_dim
product_3d_to_matrix_least_sig
product_3d_to_matrix_most_sig
product_3d_to_scalar
product_3d_to_vector
product_matrix_cols_zero
product_matrix_columns
product_matrix_rows
product_matrix_rows_zero
product_matrix_to_scalar_zero_by_zero
product_to_scalar
product_trivial
product_trivial_5d
product_vector_zero
quantize
quantize_axes
quantize_clamp
quantize_int8
reduce_3d_to_vector
reduce_matrix_cols_zero
reduce_matrix_columns
reduce_matrix_rows
reduce_matrix_rows_zero
reduce_matrix_to_scalar_zero_by_zero
reduce_to_scalar
reduce_trivial
reduce_vector_zero
reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image
reduce_window_emulating_max_pool_2d_1channel_1image_strided
reduce_window_emulating_max_pool_2d_2channel_2image
relu_2Dbackprop
relu_2Dfprop
relu_4Dbackprop
relu_4Dfprop
replace_slice_3d
replace_slice_3d_strided
replace_slice_3d_strided_different_strides
replace_slice_matrix
replace_slice_matrix_inplace
replace_slice_scalar
replace_slice_vector
reshape_3d_transpose_021
reshape_3d_transpose_102
reshape_3d_transpose_120
reshape_3d_transpose_201
reshape_3d_transpose_210
reshape_4d_no_transpose
reshape_4d_transpose
reshape_6d
reshape_m2m_dim_change_transpose
reshape_m2m_same
reshape_m2m_transpose
reshape_s2t
reshape_s2t1
reshape_t2s_012
reshape_t2s_120
reshape_t2v_012
reshape_transposed_shape_change
reshape_v2m_col
reshape_v2m_row
reshape_v2t_middle
reverse_0d
reverse_1d_0
reverse_1d_nochange
reverse_2d_0
reverse_2d_01
reverse_2d_1
reverse_2d_nochange
reverse_3d_0
reverse_3d_01
reverse_3d_012
reverse_3d_02
reverse_3d_1
reverse_3d_12
reverse_3d_2
reverse_3d_nochange
reverse_sequence_n2c3h4w2
reverse_sequence_n4c3h2w2
reverse_sequence_n4d2c3h2w2
scalar_constant_float32
scalar_constant_int64
select
select_and_scatter_3d_without_overlap
select_and_scatter_with_overlap
select_and_scatter_without_overlap
sigmoid_bprop_n1c1h4
sigmoid_n1c1h2w2
sigmoid_n1c1h4
sign
sin
sinh
slice_3d
slice_3d_strided
slice_3d_strided_different_strides
slice_matrix
slice_matrix_strided
slice_scalar
slice_vector
softmax_all
softmax_axis
softmax_axis_2
softmax_axis_3d
softmax_axis_3d_trivial
softmax_underflow
sqrt
subtract
subtract_overload
sum_3d_eliminate_zero_dim
sum_3d_to_matrix_least_sig
sum_3d_to_matrix_most_sig
sum_3d_to_scalar
sum_3d_to_vector
sum_5d_to_scalar
sum_large_1d_to_scalar
sum_matrix_6d
sum_matrix_cols_zero
sum_matrix_columns
sum_matrix_rows
sum_matrix_rows_zero
sum_matrix_to_scalar_zero_by_zero
sum_to_scalar
sum_trivial
sum_trivial_5d
sum_vector_zero
tan
tanh
tensor_2constant
tensor_constant
tensor_constant_float32
tensor_constant_int64
tensor_constant_with_op
tensorview_custom_mem
topk_1d_max_all
topk_1d_max_one
topk_1d_max_partial
topk_1d_min_all
topk_1d_min_one
topk_1d_min_partial
topk_2d_max_all
topk_2d_max_one
topk_2d_max_partial
topk_2d_min_all
topk_2d_min_one
topk_2d_min_partial
topk_3d_max_all
topk_3d_max_one
topk_3d_max_partial
topk_3d_min_all
topk_3d_min_one
topk_3d_min_partial
unhandled_op
validate_call_input_count
validate_call_input_shape
validate_call_input_type
validate_call_output_count
validate_call_output_shape
validate_call_output_type
zero_sized_abs
zero_sized_acos
zero_sized_add
zero_sized_asin
zero_sized_atan
zero_sized_ceiling
zero_sized_cos
zero_sized_cosh
zero_sized_divide
zero_sized_eq
zero_sized_exp
zero_sized_floor
zero_sized_greater
zero_sized_greatereq
zero_sized_less
zero_sized_lesseq
zero_sized_log
zero_sized_maximum
zero_sized_minimum
zero_sized_multiply
zero_sized_negative
zero_sized_not
zero_sized_not_equal
zero_sized_power
zero_sized_sign
zero_sized_sin
zero_sized_sinh
zero_sized_sqrt
zero_sized_subtract
zero_sized_tan
zero_sized_tanh
shape_of_scalar
shape_of_vector
shape_of_matrix
shape_of_5d
......@@ -390,14 +390,15 @@ shared_ptr<runtime::Tensor>
runtime::intelgpu::IntelGPUBackend::create_tensor(const element::Type& element_type,
const Shape& shape)
{
return make_shared<runtime::intelgpu::IntelGPUTensorView>(element_type, shape, *ocl_engine);
return make_shared<runtime::intelgpu::IntelGPUTensorView>(
element_type, shape, *ocl_engine, nullptr, this);
}
shared_ptr<runtime::Tensor> runtime::intelgpu::IntelGPUBackend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
return make_shared<runtime::intelgpu::IntelGPUTensorView>(
element_type, shape, *ocl_engine, memory_pointer);
element_type, shape, *ocl_engine, memory_pointer, this);
}
runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
......
......@@ -28,8 +28,9 @@ using namespace std;
runtime::intelgpu::IntelGPUTensorView::IntelGPUTensorView(const element::Type& element_type,
const Shape& shape,
const cldnn::engine& backend_engine,
void* memory_pointer)
: runtime::Tensor(make_shared<descriptor::Tensor>(element_type, shape, "external"))
void* memory_pointer,
const runtime::Backend* parent)
: runtime::Tensor(make_shared<descriptor::Tensor>(element_type, shape, "external"), parent)
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(element_type, shape);
......
......@@ -38,7 +38,8 @@ public:
IntelGPUTensorView(const element::Type& element_type,
const Shape& shape,
const cldnn::engine& backend_engine,
void* memory_pointer = nullptr);
void* memory_pointer,
const runtime::Backend* parent);
/// \brief Write bytes directly into the tensor
/// \param p Pointer to source of data
......
......@@ -43,16 +43,25 @@ extern "C" runtime::Backend* new_backend(const char* configuration_string)
return new runtime::interpreter::INTBackend();
}
runtime::interpreter::INTBackend::INTBackend()
{
}
runtime::interpreter::INTBackend::INTBackend(const vector<string>& unsupported_op_name_list)
: m_unsupported_op_name_list{unsupported_op_name_list.begin(), unsupported_op_name_list.end()}
{
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTBackend::create_tensor(const element::Type& type, const Shape& shape)
{
return make_shared<runtime::HostTensor>(type, shape, "external");
return make_shared<runtime::HostTensor>(type, shape, this);
}
shared_ptr<runtime::Tensor> runtime::interpreter::INTBackend::create_tensor(
const element::Type& type, const Shape& shape, void* memory_pointer)
{
return make_shared<runtime::HostTensor>(type, shape, memory_pointer, "external");
return make_shared<runtime::HostTensor>(type, shape, memory_pointer, this);
}
runtime::Handle runtime::interpreter::INTBackend::compile(shared_ptr<Function> function)
......@@ -336,3 +345,8 @@ void runtime::interpreter::INTBackend::perform_nan_check(
arg_number++;
}
}
bool runtime::interpreter::INTBackend::is_supported(const Node& node) const
{
return m_unsupported_op_name_list.find(node.description()) == m_unsupported_op_name_list.end();
}
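A hedged illustration of the new INTBackend deny-list constructor added above: ops named in the list are reported as unsupported by is_supported(), which a hybrid parent backend can use to place them on another device. The helper name and the choice of "Not" (mirroring get_excludes() in the GPUH plugin earlier in this diff) are assumptions for the example.

#include <memory>
#include <string>
#include <vector>
#include "ngraph/runtime/interpreter/int_backend.hpp"

using namespace ngraph;

// Hypothetical helper: an interpreter that declines "Not" nodes, so a hybrid
// backend would have to place them on some other backend in its list.
std::shared_ptr<runtime::interpreter::INTBackend> make_restricted_interpreter()
{
    return std::make_shared<runtime::interpreter::INTBackend>(
        std::vector<std::string>{"Not"});
}
// is_supported(node) then returns false whenever node.description() == "Not".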
......@@ -16,6 +16,7 @@
#pragma once
#include <initializer_list>
#include <memory>
#include <sstream>
#include <string>
......@@ -156,6 +157,12 @@ namespace ngraph
class ngraph::runtime::interpreter::INTBackend : public Backend
{
public:
INTBackend();
INTBackend(const std::vector<std::string>& unsupported_op_name_list);
INTBackend(const INTBackend&) = delete;
INTBackend(INTBackend&&) = delete;
INTBackend& operator=(const INTBackend&) = delete;
std::shared_ptr<Tensor>
create_tensor(const element::Type& type, const Shape& shape, void* memory_pointer) override;
......@@ -173,7 +180,8 @@ public:
std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override;
bool is_supported(const Node& node) const override { return true; }
bool is_supported(const Node& node) const override;
private:
int get_alignment() const { return 64; }
class FunctionInstance
......@@ -190,6 +198,7 @@ private:
void* get_temporary_pointer(size_t offset) { return m_temporary_memory->get_ptr(offset); }
};
std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map;
std::set<std::string> m_unsupported_op_name_list;
static void perform_nan_check(const std::vector<std::shared_ptr<HostTensor>>&,
const Node* op = nullptr);
......
......@@ -21,6 +21,7 @@
#include "ngraph/descriptor/layout/tensor_layout.hpp"
#include "ngraph/descriptor/tensor.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/type/element_type.hpp"
......@@ -37,9 +38,11 @@ namespace ngraph
class Tensor
{
protected:
Tensor(const std::shared_ptr<ngraph::descriptor::Tensor>& descriptor)
Tensor(const std::shared_ptr<ngraph::descriptor::Tensor>& descriptor,
const Backend* parent)
: m_descriptor(descriptor)
, m_stale(true)
, m_parent(parent)
{
}
......@@ -104,9 +107,11 @@ namespace ngraph
/// \param source The source tensor
virtual void copy_from(const ngraph::runtime::Tensor& source);
const Backend* get_parent() const { return m_parent; }
protected:
std::shared_ptr<ngraph::descriptor::Tensor> m_descriptor;
bool m_stale;
const Backend* m_parent;
};
using TensorViewPtrs = std::vector<std::shared_ptr<Tensor>>;
......
......@@ -225,10 +225,6 @@ if (NGRAPH_INTERPRETER_ENABLE)
target_link_libraries(unit-test PRIVATE interpreter_backend)
endif()
if (NGRAPH_HYBRID_ENABLE)
target_link_libraries(unit-test PRIVATE hybrid_backend)
endif()
if (NGRAPH_GPU_ENABLE)
target_link_libraries(unit-test PRIVATE gpu_backend)
endif()
......