Commit af2c4c7d authored by Robert Kimball, committed by Scott Cyphers

Framework for Hybrid GPU backend (#2196)

* add empty framework for hybrid GPU, or GPUH

* move placement to the runtime directory

* wip

* skeleton for hybrid GPU backend. Most unit tests pass.

* cleanup

* move hybrid code into hybrid dir/namespace

* move hybrid functions

* move more hybrid functions to hybrid directory

* fix placement after compile. All unit tests passing

* fix gpu backend ctor
parent 9234cc69
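Taken together, the changes below let a hybrid-wrapped graph compile and run like any other backend. A minimal usage sketch (not part of the diff; it assumes the runtime::Backend factory API of this era and a build configured with NGRAPH_GPUH_ENABLE):

```cpp
// Hedged usage sketch for the new GPUH backend. "GPUH" resolves to the
// hybrid wrapper, which assigns each node to the first inner backend that
// supports it, splits the function by placement, and copies tensors
// between the resulting sub-functions. Shapes and values are illustrative.
#include "ngraph/ngraph.hpp"

using namespace ngraph;

int main()
{
    Shape shape{2, 2};
    auto A = std::make_shared<op::Parameter>(element::f32, shape);
    auto B = std::make_shared<op::Parameter>(element::f32, shape);
    auto f = std::make_shared<Function>(std::make_shared<op::Add>(A, B),
                                        ParameterVector{A, B});

    auto backend = runtime::Backend::create("GPUH");
    auto a = backend->create_tensor(element::f32, shape);
    auto b = backend->create_tensor(element::f32, shape);
    auto result = backend->create_tensor(element::f32, shape);

    backend->compile(f);
    backend->call(f, {result}, {a, b});
    return 0;
}
```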
......@@ -71,14 +71,15 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
include(var_functions)
set(NGRAPH_HYBRID_ENABLE TRUE)
option(NGRAPH_UNIT_TEST_ENABLE "Control the building of unit tests" TRUE)
option(NGRAPH_TOOLS_ENABLE "Control the building of tools" TRUE)
option(NGRAPH_CPU_ENABLE "Control the building of the CPU backend" TRUE)
option(NGRAPH_INTELGPU_ENABLE "Control the building of the Intel GPU backend with clDNN" FALSE)
option(NGRAPH_GPU_ENABLE "Control the building of the GPU backend" FALSE)
option(NGRAPH_INTERPRETER_ENABLE "Control the building of the INTERPRETER backend" TRUE)
option(NGRAPH_HYBRID_ENABLE "Control the building of the HYBRID backend" FALSE)
option(NGRAPH_NOP_ENABLE "Control the building of the NOP backend" TRUE)
option(NGRAPH_GPUH_ENABLE "Control the building of the Hybrid GPU backend" FALSE)
option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE)
option(NGRAPH_DEBUG_ENABLE "Enable output for NGRAPH_DEBUG statements" FALSE)
option(NGRAPH_ONNX_IMPORT_ENABLE "Enable ONNX importer" FALSE)
......@@ -93,8 +94,8 @@ message(STATUS "NGRAPH_CPU_ENABLE: ${NGRAPH_CPU_ENABLE}")
message(STATUS "NGRAPH_INTELGPU_ENABLE: ${NGRAPH_INTELGPU_ENABLE}")
message(STATUS "NGRAPH_GPU_ENABLE: ${NGRAPH_GPU_ENABLE}")
message(STATUS "NGRAPH_INTERPRETER_ENABLE: ${NGRAPH_INTERPRETER_ENABLE}")
message(STATUS "NGRAPH_HYBRID_ENABLE: ${NGRAPH_HYBRID_ENABLE}")
message(STATUS "NGRAPH_NOP_ENABLE: ${NGRAPH_NOP_ENABLE}")
message(STATUS "NGRAPH_GPUH_ENABLE: ${NGRAPH_GPUH_ENABLE}")
message(STATUS "NGRAPH_DISTRIBUTED_ENABLE: ${NGRAPH_DISTRIBUTED_ENABLE}")
message(STATUS "NGRAPH_DEBUG_ENABLE: ${NGRAPH_DEBUG_ENABLE}")
message(STATUS "NGRAPH_ONNX_IMPORT_ENABLE: ${NGRAPH_ONNX_IMPORT_ENABLE}")
......
......@@ -355,35 +355,6 @@ pair<shared_ptr<op::Result>, shared_ptr<op::Parameter>>
return make_pair(res_node, par_node);
}
// The *_size suffix in the function name is temporary; it will be removed
// when the backends move to the latest Hybrid backend
pair<shared_ptr<op::Result>, shared_ptr<op::Parameter>>
ngraph::insert_result_parameter_split_size(const shared_ptr<Node>& src_node,
const shared_ptr<Node>& dst_node)
{
if (src_node->get_output_size() != 1)
{
throw ngraph_error("Multiple output per op not supported in graph partition yet.");
}
// Make parameter node
shared_ptr<op::Parameter> par_node = make_shared<op::Parameter>(
src_node->get_output_element_type(0), src_node->get_output_shape(0));
par_node->set_placement(dst_node->get_placement_size());
// Fix input / output among src, dst and par
descriptor::Input* dst_input = dst_node->get_input_from(src_node);
descriptor::Output* src_output = src_node->get_output_to(dst_node);
src_output->remove_input(dst_input); // Remove [0]
dst_input->replace_output(par_node, 0); // Remove [0] (again), add [8], remove [1], add [9]
// Add res node
shared_ptr<op::Result> res_node = make_shared<op::Result>(src_node); // Add [4], [5], [6], [7]
res_node->set_placement(src_node->get_placement_size());
return make_pair(res_node, par_node);
}
// Insert unary node between two nodes like S->D => S->N->D
// Before: | After:
// +-----+---+ +---+-----+ | +-----+---+ +---+-----+---+ +---+-----+
......@@ -458,31 +429,6 @@ Placement ngraph::get_colocated_function_placement(shared_ptr<Function> func)
return function_placement;
}
// The *_size suffix in the function name is temporary; it will be removed
// when the backends move to the latest Hybrid backend
// Assert that all nodes in the function are colocated and return that placement
size_t ngraph::get_colocated_function_placement_size(shared_ptr<Function> func)
{
auto ops = func->get_ops();
// It's okay to skip the Placement::DEFAULT check; the same node will be checked in the loop below
size_t function_placement = ops.front()->get_placement_size();
for (auto op : ops)
{
size_t node_placement = op->get_placement_size();
if (node_placement == 0)
{
throw ngraph_error("Node should have a device placement, not Placement::DEFAULT");
}
if (function_placement != node_placement)
{
throw ngraph_error("Function contains nodes of two different placements");
}
}
return function_placement;
}
std::shared_ptr<Node> ngraph::make_zero(const element::Type& element_type, const Shape& shape)
{
std::shared_ptr<Node> zero = op::Constant::create(element_type, Shape{}, {0.0});
......
......@@ -286,16 +286,11 @@ namespace ngraph
// Assert that all nodes in the function are colocated and return that placement
Placement get_colocated_function_placement(std::shared_ptr<Function> func);
size_t get_colocated_function_placement_size(std::shared_ptr<Function> func);
std::pair<std::shared_ptr<op::Result>, std::shared_ptr<op::Parameter>>
insert_result_parameter_split(const std::shared_ptr<Node>& src_node,
const std::shared_ptr<Node>& dst_node);
std::pair<std::shared_ptr<op::Result>, std::shared_ptr<op::Parameter>>
insert_result_parameter_split_size(const std::shared_ptr<Node>& src_node,
const std::shared_ptr<Node>& dst_node);
void insert_new_node_between(const std::shared_ptr<Node>& src_node,
const std::shared_ptr<Node>& dst_node,
const std::shared_ptr<Node>& new_node);
......
......@@ -150,14 +150,14 @@ void Node::set_placement(Placement placement)
m_placement = placement;
}
size_t Node::get_placement_size() const
size_t Node::get_placement_index() const
{
return m_placement_size;
return m_placement_index;
}
void Node::set_placement(size_t placement)
void Node::set_placement_index(size_t placement)
{
m_placement_size = placement;
m_placement_index = placement;
}
std::shared_ptr<Node> Node::get_argument(size_t index) const
......
......@@ -234,10 +234,10 @@ namespace ngraph
void set_placement(Placement placement);
/// Get device placement
size_t get_placement_size() const;
size_t get_placement_index() const;
/// Set device placement
void set_placement(size_t placement);
void set_placement_index(size_t placement);
/// Get input descriptor that is connected to src
descriptor::Input* get_input_from(const std::shared_ptr<Node>& src);
......@@ -251,6 +251,8 @@ namespace ngraph
virtual std::shared_ptr<Node> get_default_value() const { return nullptr; }
/// Use instance ids for comparison instead of memory addresses to improve determinism
bool operator<(const Node& other) const { return m_instance_id < other.m_instance_id; }
static const size_t placement_invalid = -1;
protected:
std::set<std::shared_ptr<Node>> m_control_dependencies;
void set_output_size(size_t n);
......@@ -264,7 +266,7 @@ namespace ngraph
std::deque<descriptor::Output> m_outputs;
std::unordered_map<Node*, autodiff::Adjoints> m_adjoint_map;
Placement m_placement = Placement::DEFAULT;
size_t m_placement_size = 0;
size_t m_placement_index = placement_invalid;
};
class NodeValidationError : public AssertionFailure
......
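A note on the sentinel introduced above: placement_invalid is size_t(-1), i.e. SIZE_MAX. The old default of 0 would collide with a real backend index now that the hybrid AssignPlacement pass (later in this diff) assigns 0-based indices. A standalone sketch of the distinction (illustrative, not commit code):

```cpp
// Illustrative check against the new sentinel; the names mirror Node, but
// this is a standalone sketch, not code from the commit.
#include <cassert>
#include <cstddef>

static const size_t placement_invalid = static_cast<size_t>(-1);

int main()
{
    size_t m_placement_index = placement_invalid;   // default, as in Node
    assert(m_placement_index == placement_invalid); // nothing assigned yet

    m_placement_index = 0; // index 0 is now a real backend, not "default"
    assert(m_placement_index != placement_invalid);
    return 0;
}
```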
......@@ -36,7 +36,7 @@ void op::Result::validate_and_infer_types()
<< " outputs (1 expected).";
// Always borrow the placement configuration, even the default one
set_placement(get_argument(0)->get_placement_size());
set_placement_index(get_argument(0)->get_placement_index());
set_output_type(0, get_input_element_type(0), get_input_partial_shape(0));
}
......
......@@ -28,31 +28,9 @@ pass::AssignPlacement::AssignPlacement(function<Placement(shared_ptr<Node>)> pla
{
}
pass::AssignPlacement::AssignPlacement(vector<shared_ptr<runtime::Backend>> placement_backends)
: m_placement_backends(placement_backends)
{
}
bool pass::AssignPlacement::run_on_node(shared_ptr<Node> node)
{
if (!m_placement_backends.empty())
{
size_t backend_index = 0;
for (auto backend : m_placement_backends)
{
backend_index += 1;
if (backend->is_supported(*node))
{
node->set_placement(backend_index);
return false;
}
}
throw runtime_error("Node " + node->get_name() + " not supported by any backend");
}
else
{
node->set_placement(m_placement_policy(node));
}
node->set_placement(m_placement_policy(node));
return false;
}
......@@ -32,14 +32,10 @@ namespace ngraph
public:
// TODO: make policy a class
AssignPlacement(std::function<Placement(std::shared_ptr<Node>)> placement_policy);
AssignPlacement(
std::vector<std::shared_ptr<ngraph::runtime::Backend>> placement_backends);
private:
bool run_on_node(std::shared_ptr<Node> node) override;
std::vector<std::shared_ptr<ngraph::runtime::Backend>> m_placement_backends;
std::function<Placement(std::shared_ptr<Node>)> m_placement_policy;
};
}
......
......@@ -224,186 +224,3 @@ pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shar
return make_pair(sub_functions, map_parameter_to_result);
}
static Node* take_independent_node_with_placement_priority_size(
map<size_t, deque<Node*>>& independent_nodes_by_placement, size_t placement)
{
Node* selected_node = nullptr;
if (independent_nodes_by_placement.find(placement) != independent_nodes_by_placement.end() &&
independent_nodes_by_placement.at(placement).size() != 0)
{
selected_node = independent_nodes_by_placement.at(placement).front();
independent_nodes_by_placement.at(placement).pop_front();
}
else
{
for (auto& it : independent_nodes_by_placement)
{
if (it.second.size() > 0)
{
selected_node = it.second.front();
it.second.pop_front();
break;
}
}
}
return selected_node;
}
static vector<unordered_set<shared_ptr<Node>>>
group_function_nodes_to_clusters_size(const shared_ptr<Function>& f)
{
// Topologically sort nodes by picking independent node with the same placement as the
// previously picked node greedily
map<size_t, deque<Node*>> independent_nodes_by_placement;
unordered_map<Node*, size_t> node_dependency_count;
unordered_map<ngraph::Node*, shared_ptr<ngraph::Node>> node_map;
for (shared_ptr<Node> node : f->get_ops())
{
size_t dependency_count = node->get_arguments().size();
node_map[node.get()] = node;
node_dependency_count[node.get()] = dependency_count;
if (dependency_count == 0)
{
independent_nodes_by_placement[node->get_placement_size()].push_back(node.get());
}
}
list<shared_ptr<Node>> sorted_nodes;
size_t previous_placement = 0; // Placement::DEFAULT
while (Node* independent_node = take_independent_node_with_placement_priority_size(
independent_nodes_by_placement, previous_placement))
{
previous_placement = independent_node->get_placement_size();
sorted_nodes.push_back(node_map.at(independent_node));
for (auto user : independent_node->get_users())
{
Node* user_node = user.get();
node_dependency_count.at(user_node) -= 1;
if (node_dependency_count.at(user_node) == 0)
{
independent_nodes_by_placement[user_node->get_placement_size()].push_back(
user_node);
}
}
}
if (sorted_nodes.size() != f->get_ops().size())
{
throw ngraph_error("sorted_nodes.size()== " + to_string(sorted_nodes.size()) +
" != f->get_ops().size()== " + to_string(f->get_ops().size()) +
". Internal error with topological sort.");
}
// Build clusters from the sorted_nodes
previous_placement = 0; // Placement::DEFAULT;
vector<unordered_set<shared_ptr<Node>>> clusters;
for (shared_ptr<Node> node : sorted_nodes)
{
size_t node_placement = node->get_placement_size();
if (node_placement != previous_placement)
{
unordered_set<shared_ptr<Node>> new_cluster;
clusters.push_back(new_cluster);
}
clusters.back().insert(node);
previous_placement = node_placement;
}
// Sanity check for node duplication and full node coverage
unordered_set<shared_ptr<Node>> cluster_nodes;
for (auto cluster : clusters)
{
for (auto node : cluster)
{
if (cluster_nodes.find(node) != cluster_nodes.end())
{
throw ngraph_error("Node " + node->get_name() + " is duplicated in clusters");
}
cluster_nodes.insert(node);
}
}
unordered_set<shared_ptr<Node>> f_nodes;
for (auto node : f->get_ordered_ops())
{
f_nodes.insert(node);
}
if (cluster_nodes != f_nodes)
{
throw ngraph_error(
"Cluster's nodes are not the same as function's nodes. cluster_nodes.size()=" +
to_string(cluster_nodes.size()) + ", f_nodes.size()=" + to_string(f_nodes.size()));
}
return clusters;
}
// The *_size suffix in the function name is temporary; it will be removed
// when the backends move to the latest Hybrid backend
pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>>>
ngraph::split_function_by_placement_size(const shared_ptr<Function>& f)
{
// Split functions to clusters of nodes that can be computed together
vector<unordered_set<shared_ptr<Node>>> clusters = group_function_nodes_to_clusters_size(f);
// Map from (intermediate) parameter to result node, for guiding data copy among devices
unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>> map_parameter_to_result;
// Split neighboring nodes if they belong to different clusters
// TODO: optimization to group multiple result node from the same source,
// and to group the parameter node in the same cluster with the same result node source
unordered_map<shared_ptr<Node>, unordered_set<shared_ptr<Node>>*> map_node_to_cluster;
for (auto& cluster : clusters)
{
for (auto node : cluster)
{
map_node_to_cluster[node] = &cluster;
}
}
for (auto dst_node : f->get_ordered_ops())
{
for (auto src_node : dst_node->get_arguments())
{
auto src_cluster = map_node_to_cluster.at(src_node);
auto dst_cluster = map_node_to_cluster.at(dst_node);
if (src_cluster != dst_cluster)
{
// Split src_node and dst_node
pair<shared_ptr<op::Result>, shared_ptr<op::Parameter>> res_par_pair =
insert_result_parameter_split_size(src_node, dst_node);
shared_ptr<op::Result> res_node = res_par_pair.first;
shared_ptr<op::Parameter> par_node = res_par_pair.second;
map_parameter_to_result[par_node] = res_node;
// Insert newly created nodes into clusters
src_cluster->insert(res_node);
dst_cluster->insert(par_node);
}
}
}
// Create functions from clusters
vector<shared_ptr<Function>> sub_functions;
for (auto cluster : clusters)
{
ParameterVector par_vector;
ResultVector res_vector;
for (auto node : cluster)
{
if (auto res_node = dynamic_pointer_cast<op::Result>(node))
{
res_vector.push_back(res_node);
}
else if (auto par_node = dynamic_pointer_cast<op::Parameter>(node))
{
par_vector.push_back(par_node);
}
}
auto sub_function = make_shared<Function>(res_vector, par_vector);
sub_functions.push_back(sub_function);
}
return make_pair(sub_functions, map_parameter_to_result);
}
......@@ -51,9 +51,4 @@ namespace ngraph
std::pair<std::vector<std::shared_ptr<Function>>,
std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<op::Result>>>
split_function_by_placement(const std::shared_ptr<Function>& f);
// Split a function into function(s) with unique placement
std::pair<std::vector<std::shared_ptr<Function>>,
std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<op::Result>>>
split_function_by_placement_size(const std::shared_ptr<Function>& f);
}
......@@ -36,4 +36,8 @@ if (NGRAPH_NOP_ENABLE)
add_subdirectory(nop)
endif()
if (NGRAPH_GPUH_ENABLE)
add_subdirectory(gpuh)
endif()
add_subdirectory(plaidml)
......@@ -37,15 +37,7 @@ extern "C" const char* get_ngraph_version_string()
extern "C" runtime::Backend* new_backend(const char* configuration_string)
{
#ifdef NGRAPH_HYBRID_ENABLE
vector<pair<string, shared_ptr<runtime::Backend>>> backend_list{
{"GPU", make_shared<runtime::gpu::GPU_Backend>()}};
auto wrapper = new runtime::hybrid::HybridBackend(backend_list);
return wrapper;
#else
return new runtime::gpu::GPU_Backend();
#endif
}
extern "C" void delete_backend(runtime::Backend* backend)
......
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
if (NGRAPH_GPUH_ENABLE)
add_library(gpuh_backend SHARED gpuh_backend.cpp)
if(NGRAPH_LIB_VERSIONING_ENABLE)
set_target_properties(gpuh_backend PROPERTIES
VERSION ${NGRAPH_VERSION}
SOVERSION ${NGRAPH_API_VERSION})
endif()
target_link_libraries(gpuh_backend PUBLIC ngraph)
set_target_properties(gpuh_backend PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${NGRAPH_BUILD_DIR})
install(TARGETS gpuh_backend
LIBRARY DESTINATION "${NGRAPH_INSTALL_LIB}"
ARCHIVE DESTINATION "${NGRAPH_INSTALL_LIB}"
)
endif()
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/gpuh/gpuh_backend.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/interpreter/int_backend.hpp"
#include "ngraph/runtime/tensor.hpp"
using namespace ngraph;
using namespace std;
extern "C" const char* get_ngraph_version_string()
{
return NGRAPH_VERSION;
}
extern "C" runtime::Backend* new_backend(const char* configuration_string)
{
return new runtime::gpuh::GPUHBackend();
}
runtime::gpuh::GPUHBackend::GPUHBackend()
: HybridBackend({{"INTERPRETER", make_shared<ngraph::runtime::interpreter::INTBackend>()}})
{
}
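As wired here, GPUH delegates everything to INTERPRETER. The wrapper deleted from the GPU backend's new_backend above suggests how a real GPU entry would later slot in, listed first so it gets first claim on nodes, with INTERPRETER as the fallback. A hedged sketch (the gpu_backend.hpp include path and GPU_Backend availability are assumptions, not part of this commit):

```cpp
// Hypothetical future constructor for GPUHBackend, mirroring the backend
// list removed from gpu_backend.cpp above; not part of this commit.
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpuh/gpuh_backend.hpp"
#include "ngraph/runtime/interpreter/int_backend.hpp"

using namespace ngraph;
using namespace std;

runtime::gpuh::GPUHBackend::GPUHBackend()
    : HybridBackend({{"GPU", make_shared<runtime::gpu::GPU_Backend>()},
                     {"INTERPRETER", make_shared<runtime::interpreter::INTBackend>()}})
{
}
```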
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ngraph/runtime/hybrid/hybrid_backend.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpuh
{
class GPUHBackend;
}
}
}
class ngraph::runtime::gpuh::GPUHBackend : public ngraph::runtime::hybrid::HybridBackend
{
public:
GPUHBackend();
};
......@@ -15,7 +15,10 @@
# ******************************************************************************
if (NGRAPH_HYBRID_ENABLE)
add_library(hybrid_backend SHARED hybrid_backend.cpp)
add_library(hybrid_backend SHARED
hybrid_backend.cpp
hybrid_util.cpp
pass/assign_placement.cpp)
if(NGRAPH_LIB_VERSIONING_ENABLE)
set_target_properties(hybrid_backend PROPERTIES
VERSION ${NGRAPH_VERSION}
......
......@@ -16,8 +16,9 @@
#include "ngraph/runtime/hybrid/hybrid_backend.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/hybrid/hybrid_util.hpp"
#include "ngraph/runtime/hybrid/pass/assign_placement.hpp"
#include "ngraph/runtime/tensor.hpp"
using namespace ngraph;
......@@ -80,8 +81,8 @@ runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> fun
instance.m_function = clone_function(*func);
// Run placement pass
pass::Manager pass_manager;
pass_manager.register_pass<pass::AssignPlacement>(backend_list);
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<runtime::hybrid::pass::AssignPlacement>(backend_list);
pass_manager.run_passes(instance.m_function);
// Split function to sub_functions
......@@ -93,9 +94,15 @@ runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> fun
for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
{
size_t placement = get_colocated_function_placement_size(sub_function);
auto backend =
m_backend_list[(placement - 1)]; // (placement-1) as 0 is default placement
auto backend = m_backend_list[placement];
backend.second->compile(sub_function);
// Compile will replace nodes so we need to make one more pass through all
// ops to reset placement
for (auto op : sub_function->get_ops())
{
op->set_placement_index(placement);
}
}
}
......@@ -132,8 +139,7 @@ bool runtime::hybrid::HybridBackend::call(shared_ptr<Function> func,
{
// Init backend
size_t placement = get_colocated_function_placement_size(sub_function);
// (placement-1) as 0 is default placement
auto backend = m_backend_list[(placement - 1)].second;
auto backend = m_backend_list[placement].second;
// Prepare parameter TensorViews
vector<shared_ptr<runtime::Tensor>> parameter_tvs;
......
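The indexing change above (m_backend_list[placement] rather than placement - 1) works because the hybrid AssignPlacement pass, added later in this diff, records 0-based indices. A standalone model of that contract (illustrative stub, not commit code):

```cpp
// Models the 0-based placement contract: the pass stores the index of the
// first backend that supports a node, and dispatch uses that index to
// address the backend list directly.
#include <cstddef>
#include <stdexcept>
#include <vector>

struct StubBackend
{
    bool supports_node; // stands in for runtime::Backend::is_supported
};

size_t assign_placement_index(const std::vector<StubBackend>& backends)
{
    for (size_t i = 0; i < backends.size(); i++)
    {
        if (backends[i].supports_node)
        {
            return i; // usable directly as an m_backend_list index
        }
    }
    throw std::runtime_error("node not supported by any backend");
}

int main()
{
    std::vector<StubBackend> backends{{false}, {true}};
    return assign_placement_index(backends) == 1 ? 0 : 1;
}
```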
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include "ngraph/function.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/op/result.hpp"
namespace ngraph
{
namespace runtime
{
namespace hybrid
{
// Split a function into function(s) with unique placement
std::pair<
std::vector<std::shared_ptr<Function>>,
std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<op::Result>>>
split_function_by_placement_size(const std::shared_ptr<Function>& f);
// Assert that all nodes in the function are colocated and return that placement
size_t get_colocated_function_placement_size(std::shared_ptr<Function> func);
}
}
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/hybrid/pass/assign_placement.hpp"
#include "ngraph/log.hpp"
#include "ngraph/node.hpp"
#include "ngraph/placement.hpp"
#include "ngraph/runtime/backend.hpp"
using namespace ngraph;
using namespace std;
runtime::hybrid::pass::AssignPlacement::AssignPlacement(
vector<shared_ptr<runtime::Backend>> placement_backends)
: m_placement_backends(placement_backends)
{
}
bool runtime::hybrid::pass::AssignPlacement::run_on_node(shared_ptr<Node> node)
{
size_t backend_index = 0;
for (auto backend : m_placement_backends)
{
if (backend->is_supported(*node))
{
node->set_placement_index(backend_index);
return false;
}
backend_index++;
}
throw runtime_error("Node " + node->get_name() + " not supported by any backend");
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <exception>
#include <functional>
#include <sstream>
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace runtime
{
namespace hybrid
{
namespace pass
{
class AssignPlacement;
}
}
}
}
class ngraph::runtime::hybrid::pass::AssignPlacement : public ngraph::pass::NodePass
{
public:
// TODO: make policy a class
AssignPlacement(std::vector<std::shared_ptr<ngraph::runtime::Backend>> placement_backends);
private:
bool run_on_node(std::shared_ptr<Node> node) override;
std::vector<std::shared_ptr<ngraph::runtime::Backend>> m_placement_backends;
};
......@@ -15,7 +15,10 @@
//*****************************************************************************
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/assertion.hpp"
#include "ngraph/descriptor/layout/tensor_layout.hpp"
#include "ngraph/log.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/type/element_type.hpp"
using namespace ngraph;
......@@ -70,3 +73,21 @@ void runtime::Tensor::set_stale(bool val)
{
m_stale = val;
}
void runtime::Tensor::copy_from(const ngraph::runtime::Tensor& source)
{
if (get_element_count() != source.get_element_count())
{
throw invalid_argument("runtime::Tensor::copy_from element count must match");
}
if (get_element_type() != source.get_element_type())
{
throw invalid_argument("runtime::Tensor::copy_from element types must match");
}
// This is potentially inefficient but is supplied only to get things going
// This will be replaced with more optimal implementations in later PRs
auto size = get_size_in_bytes();
AlignedBuffer buffer{size, 64};
source.read(buffer.get_ptr(), 0, size);
write(buffer.get_ptr(), 0, size);
}
......@@ -100,6 +100,10 @@ namespace ngraph
/// \param n Number of bytes to read, must be integral number of elements.
virtual void read(void* p, size_t offset, size_t n) const = 0;
/// \brief Copy bytes directly from source to this tensor
/// \param source The source tensor
virtual void copy_from(const ngraph::runtime::Tensor& source);
protected:
std::shared_ptr<ngraph::descriptor::Tensor> m_descriptor;
bool m_stale;
......
......@@ -76,12 +76,6 @@ if (NGRAPH_INTERPRETER_ENABLE)
set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} INTERPRETER)
endif()
if (NGRAPH_HYBRID_ENABLE)
list(APPEND SRC
hybrid_backend.cpp
hybrid_utils.cpp)
endif()
if (NGRAPH_CPU_ENABLE)
list(APPEND SRC core_fusion.cpp builder_quantization.cpp)
list(APPEND SRC backend_performance.cpp cpu_fusion.cpp cpu_test.cpp cpu_reshape_sinking.cpp cpu_debugger.cpp)
......@@ -100,8 +94,8 @@ if (NGRAPH_INTELGPU_ENABLE)
set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} INTELGPU)
endif()
if (NGRAPH_HYBRID_ENABLE)
set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} HYBRID)
if (NGRAPH_GPUH_ENABLE)
set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} GPUH)
endif()
if (NGRAPH_PLAIDML_ENABLE)
......@@ -232,6 +226,10 @@ if (NGRAPH_NOP_ENABLE)
target_link_libraries(unit-test PRIVATE nop_backend)
endif()
if (NGRAPH_GPUH_ENABLE)
target_link_libraries(unit-test PRIVATE gpuh_backend)
endif()
if (NGRAPH_ONNXIFI_ENABLE)
target_include_directories(unit-test SYSTEM PUBLIC ${ONNX_INCLUDE_DIR})
target_link_libraries(unit-test PRIVATE onnxifi-ngraph)
......