Commit 1efd0bfd authored by Robert Kimball, committed by Scott Cyphers

The General Purpose graph splitting is no longer used (#2391)

* remove general splitting code. New code in hybrid transformer.

* more cleanup
parent c9a9c154
@@ -123,7 +123,6 @@ set (SRC
op/util/logical_reduction.cpp
op/util/unary_elementwise_arithmetic.cpp
partial_shape.cpp
pass/assign_placement.cpp
pass/algebraic_simplification.cpp
pass/common_function_collection.cpp
pass/constant_folding.cpp
@@ -139,8 +138,8 @@ set (SRC
pass/memory_visualize.cpp
pass/nop_elimination.cpp
pass/pass.cpp
pass/pass_config.cpp
pass/prefix_reshape_elimination.cpp
pass/propagate_cacheability.cpp
pass/reshape_elimination.cpp
pass/reshape_sinking.cpp
......
@@ -31,7 +31,6 @@
#include "ngraph/op/constant.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/placement.hpp"
#include "ngraph/result_vector.hpp"
#include "ngraph/util.hpp"
@@ -406,29 +405,6 @@ void ngraph::insert_new_node_between(const shared_ptr<Node>& src_node,
dst_input->replace_output(new_node, 0); // Remove [0] (again), add [8], remove [1], add [9]
}
// Assert that nodes in the function are colocated and return that placement
Placement ngraph::get_colocated_function_placement(shared_ptr<Function> func)
{
Placement function_placement = Placement::DEFAULT;
traverse_nodes(func, [&](shared_ptr<Node> node) {
Placement node_placement = node->get_placement();
if (node_placement == Placement::DEFAULT)
{
throw ngraph_error("Node should have a device placement, not Placement::DEFAULT");
}
if (function_placement == Placement::DEFAULT)
{
// First time seeing a node
function_placement = node->get_placement();
}
else if (function_placement != node_placement)
{
throw ngraph_error("Function contains nodes of two different placements");
}
});
return function_placement;
}
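// Example, for clarity: a sub-function whose nodes are all Placement::CPU yields
// Placement::CPU; mixing CPU and INTERPRETER nodes throws, as does any node still
// at Placement::DEFAULT.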
std::shared_ptr<Node> ngraph::make_zero(const element::Type& element_type, const Shape& shape)
{
std::shared_ptr<Node> zero = op::Constant::create(element_type, Shape{}, {0.0});
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/log.hpp"
#include "ngraph/node.hpp"
#include "ngraph/placement.hpp"
#include "ngraph/runtime/backend.hpp"
using namespace ngraph;
using namespace std;
pass::AssignPlacement::AssignPlacement(function<Placement(shared_ptr<Node>)> placement_policy)
: m_placement_policy(placement_policy)
{
}
bool pass::AssignPlacement::run_on_node(shared_ptr<Node> node)
{
node->set_placement(m_placement_policy(node));
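// Placement is only an annotation on the node, so report that the graph
// structure was left unmodified.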
return false;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <exception>
#include <functional>
#include <sstream>
#include "ngraph/pass/pass.hpp"
#include "ngraph/placement.hpp"
namespace ngraph
{
namespace pass
{
class AssignPlacement : public NodePass
{
public:
// TODO: make policy a class
AssignPlacement(std::function<Placement(std::shared_ptr<Node>)> placement_policy);
private:
bool run_on_node(std::shared_ptr<Node> node) override;
std::function<Placement(std::shared_ptr<Node>)> m_placement_policy;
};
}
}
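A minimal usage sketch, taken from the deleted unit tests later in this diff (assuming f is a shared_ptr<Function>; the all-CPU lambda is just an example policy):

pass::Manager pass_manager;
pass_manager.register_pass<pass::AssignPlacement>(
    [](std::shared_ptr<Node> node) { return Placement::CPU; });
pass_manager.run_passes(f);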
@@ -39,188 +39,3 @@ std::string ngraph::placement_to_string(Placement placement)
}
throw runtime_error("unhandled placement type");
}
// Pop the next ready (dependency-free) node, preferring the queue whose placement
// matches the previous pick; fall back to any non-empty queue, or return nullptr
// when every queue is empty.
static Node* take_independent_node_with_placement_priority(
map<Placement, deque<Node*>>& independent_nodes_by_placement, Placement placement)
{
Node* selected_node = nullptr;
if (independent_nodes_by_placement.find(placement) != independent_nodes_by_placement.end() &&
independent_nodes_by_placement.at(placement).size() != 0)
{
selected_node = independent_nodes_by_placement.at(placement).front();
independent_nodes_by_placement.at(placement).pop_front();
}
else
{
for (auto& it : independent_nodes_by_placement)
{
if (it.second.size() > 0)
{
selected_node = it.second.front();
it.second.pop_front();
break;
}
}
}
return selected_node;
}
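// Worked example: the caller below starts with previous_placement == Placement::DEFAULT,
// which has no queue, so the first non-empty queue supplies the node. From then on the
// queue matching the last pick is preferred, keeping runs of same-placement nodes
// together and minimizing the number of clusters formed below. A nullptr return (all
// queues empty) ends the caller's while loop.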
static vector<unordered_set<shared_ptr<Node>>>
group_function_nodes_to_clusters(const shared_ptr<Function>& f)
{
// Topologically sort nodes, greedily picking an independent node with the same
// placement as the previously picked node whenever one is available
map<Placement, deque<Node*>> independent_nodes_by_placement;
unordered_map<Node*, size_t> node_dependency_count;
unordered_map<ngraph::Node*, shared_ptr<ngraph::Node>> node_map;
for (shared_ptr<Node> node : f->get_ops())
{
size_t dependency_count = node->get_arguments().size();
node_map[node.get()] = node;
node_dependency_count[node.get()] = dependency_count;
if (dependency_count == 0)
{
independent_nodes_by_placement[node->get_placement()].push_back(node.get());
}
}
list<shared_ptr<Node>> sorted_nodes;
Placement previous_placement = Placement::DEFAULT;
while (Node* independent_node = take_independent_node_with_placement_priority(
independent_nodes_by_placement, previous_placement))
{
previous_placement = independent_node->get_placement();
sorted_nodes.push_back(node_map.at(independent_node));
for (auto user : independent_node->get_users())
{
Node* user_node = user.get();
node_dependency_count.at(user_node) -= 1;
if (node_dependency_count.at(user_node) == 0)
{
independent_nodes_by_placement[user_node->get_placement()].push_back(user_node);
}
}
}
if (sorted_nodes.size() != f->get_ops().size())
{
throw ngraph_error("sorted_nodes.size()== " + to_string(sorted_nodes.size()) +
" != f->get_ops().size()== " + to_string(f->get_ops().size()) +
". Internal error with topological sort.");
}
// Build clusters from the sorted_nodes
previous_placement = Placement::DEFAULT;
vector<unordered_set<shared_ptr<Node>>> clusters;
for (shared_ptr<Node> node : sorted_nodes)
{
Placement node_placement = node->get_placement();
if (node_placement != previous_placement)
{
unordered_set<shared_ptr<Node>> new_cluster;
clusters.push_back(new_cluster);
}
clusters.back().insert(node);
previous_placement = node_placement;
}
// Sanity check for node duplication and full node coverage
unordered_set<shared_ptr<Node>> cluster_nodes;
for (auto cluster : clusters)
{
for (auto node : cluster)
{
if (cluster_nodes.find(node) != cluster_nodes.end())
{
throw ngraph_error("Node " + node->get_name() + " is duplicated in clusters");
}
cluster_nodes.insert(node);
}
}
unordered_set<shared_ptr<Node>> f_nodes;
for (auto node : f->get_ordered_ops())
{
f_nodes.insert(node);
}
if (cluster_nodes != f_nodes)
{
throw ngraph_error(
"Cluster's nodes are not the same as function's nodes. cluster_nodes.size()=" +
to_string(cluster_nodes.size()) + ", f_nodes.size()=" + to_string(f_nodes.size()));
}
return clusters;
}
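// Example: if the sorted nodes carry placements [INT, INT, CPU, INT], the clusters are
// {n1, n2}, {n3}, {n4}: one cluster per contiguous run of equal placement, each later
// extracted as its own sub-function.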
// Split the function by placement, maximizing the span of each subgraph. Each subgraph
// will be placed on a single device.
//
// For nested functions, we only consider the ops in the main function that represent
// calls to the nested functions.
pair<vector<shared_ptr<Function>>, unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>>>
ngraph::split_function_by_placement(const shared_ptr<Function>& f)
{
// Split the function into clusters of nodes that can be computed together
vector<unordered_set<shared_ptr<Node>>> clusters = group_function_nodes_to_clusters(f);
// Map from (intermediate) parameter to result node, for guiding data copy among devices
unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>> map_parameter_to_result;
// Split neighboring nodes if they belong to different clusters
// TODO: optimization to group multiple result nodes from the same source,
// and to group the parameter node in the same cluster as its result node source
unordered_map<shared_ptr<Node>, unordered_set<shared_ptr<Node>>*> map_node_to_cluster;
for (auto& cluster : clusters)
{
for (auto node : cluster)
{
map_node_to_cluster[node] = &cluster;
}
}
for (auto dst_node : f->get_ordered_ops())
{
for (auto src_node : dst_node->get_arguments())
{
auto src_cluster = map_node_to_cluster.at(src_node);
auto dst_cluster = map_node_to_cluster.at(dst_node);
if (src_cluster != dst_cluster)
{
// Split src_node and dst_node
pair<shared_ptr<op::Result>, shared_ptr<op::Parameter>> res_par_pair =
insert_result_parameter_split(src_node, dst_node);
shared_ptr<op::Result> res_node = res_par_pair.first;
shared_ptr<op::Parameter> par_node = res_par_pair.second;
map_parameter_to_result[par_node] = res_node;
// Insert newly created nodes into clusters
src_cluster->insert(res_node);
dst_cluster->insert(par_node);
}
}
}
// Create functions from clusters
vector<shared_ptr<Function>> sub_functions;
for (auto cluster : clusters)
{
ParameterVector par_vector;
ResultVector res_vector;
for (auto node : cluster)
{
if (auto res_node = dynamic_pointer_cast<op::Result>(node))
{
res_vector.push_back(res_node);
}
else if (auto par_node = dynamic_pointer_cast<op::Parameter>(node))
{
par_vector.push_back(par_node);
}
}
auto sub_function = make_shared<Function>(res_vector, par_vector);
sub_functions.push_back(sub_function);
}
return make_pair(sub_functions, map_parameter_to_result);
}
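For reference, a minimal sketch of the end-to-end flow this function supported, mirroring HybridBackend::compile in the deleted unit test later in this diff (assuming func is a shared_ptr<Function>; int_with_cpu_mul_policy is the example policy defined in that test):

// Assign a placement to every node, split the function into colocated sub-functions,
// then compile each piece on the backend matching its placement. The parameter-to-result
// map records where tensors must be copied between devices at call time.
pass::Manager pass_manager;
pass_manager.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
pass_manager.run_passes(func);

vector<shared_ptr<Function>> sub_functions;
unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>> map_parameter_to_result;
tie(sub_functions, map_parameter_to_result) = split_function_by_placement(func);

for (const shared_ptr<Function>& sub_function : sub_functions)
{
    auto backend = runtime::Backend::create(
        placement_to_string(get_colocated_function_placement(sub_function)));
    backend->compile(sub_function);
}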
@@ -26,15 +26,6 @@
namespace ngraph
{
class Function;
class Node;
namespace op
{
class Parameter;
class Result;
}
enum class Placement
{
DEFAULT,
@@ -46,9 +37,4 @@ namespace ngraph
};
std::string placement_to_string(Placement placement);
// Split function to function(s) with unique placement
std::pair<std::vector<std::shared_ptr<Function>>,
std::unordered_map<std::shared_ptr<op::Parameter>, std::shared_ptr<op::Result>>>
split_function_by_placement(const std::shared_ptr<Function>& f);
}
@@ -16,7 +16,6 @@
#include "ngraph/runtime/gpuh/gpuh_backend.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/interpreter/int_backend.hpp"
......
@@ -37,7 +37,6 @@ set(SRC
cse.cpp
element_type.cpp
file_util.cpp
graph_partition.cpp
includes.cpp
input_output_assign.cpp
main.cpp
@@ -200,7 +199,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "^(Apple)?Clang$")
endif()
if (NGRAPH_CPU_ENABLE)
# The INTERPRETER backend is required for graph_partition, convolution, and backwards unit tests
# The INTERPRETER backend is required for convolution and backwards unit tests
target_link_libraries(unit-test PRIVATE cpu_backend interpreter_backend)
target_link_libraries(unit-test PRIVATE libmkldnn)
endif()
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <memory>
#include <sstream>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <vector>
#include "gtest/gtest.h"
#include "ngraph/graph_util.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/pass/assign_placement.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/util.hpp"
#include "util/ndarray.hpp"
#include "util/test_tools.hpp"
using namespace std;
using namespace ngraph;
// Perform all operations on INTERPRETER and fall back to CPU for Multiply
static function<Placement(shared_ptr<Node>)> int_with_cpu_mul_policy = [](shared_ptr<Node> node) {
Placement placement;
string node_op = node->description();
if (node_op == "Multiply")
{
placement = Placement::CPU;
}
else
{
placement = Placement::INTERPRETER;
}
return placement;
};
// HybridBackend serves two purposes:
// 1. Its main use case is to test device placement and graph partition routines.
// 2. It also shows how a glued-hybrid runtime can be built by combining different runtimes.
//
// By default, HybridBackend operates on INTERPRETER (for example, the tensor view is an
// INTERPRETER tensor view). It falls back to CPU when requested by placement.
class HybridBackend
{
public:
HybridBackend(const function<Placement(shared_ptr<Node>)>& placement_policy)
: m_placement_policy(placement_policy)
{
}
~HybridBackend() {}
shared_ptr<runtime::Tensor> create_tensor(const element::Type& element_type, const Shape& shape)
{
return get_cached_backend(Placement::INTERPRETER)->create_tensor(element_type, shape);
}
bool compile(const shared_ptr<Function>& func)
{
if (m_function_map.find(func) == m_function_map.end())
{
// Clone function
FunctionInstance instance;
instance.m_function = clone_function(*func);
// Run placement pass
pass::Manager pass_manager;
pass_manager.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
pass_manager.run_passes(instance.m_function);
// Split function to sub_functions
tie(instance.m_sub_functions, instance.m_map_parameter_to_result) =
split_function_by_placement(instance.m_function);
m_function_map.insert({func, instance});
// Compile subfunctions in corresponding backends
for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
{
Placement placement = get_colocated_function_placement(sub_function);
auto backend = get_cached_backend(placement);
backend->compile(sub_function);
}
}
return true;
}
bool call_with_validate(const shared_ptr<Function>& func,
const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
// Get FunctionInstance
bool rc = true;
auto it = m_function_map.find(func);
if (it == m_function_map.end())
{
compile(func);
it = m_function_map.find(func);
}
if (it == m_function_map.end())
{
throw runtime_error("Error constructing backend.");
}
FunctionInstance& instance = it->second;
// Each parameter and result node in a sub_function maps to one Tensor
unordered_map<shared_ptr<Node>, shared_ptr<runtime::Tensor>> map_node_to_tensor_view;
for (size_t i = 0; i < inputs.size(); ++i)
{
map_node_to_tensor_view[instance.m_function->get_parameters()[i]] = inputs[i];
}
for (size_t i = 0; i < outputs.size(); ++i)
{
map_node_to_tensor_view[instance.m_function->get_results()[i]] = outputs[i];
}
// Call subfunctions
for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
{
// Init backend
Placement placement = get_colocated_function_placement(sub_function);
auto backend = get_cached_backend(placement);
// Prepare parameter TensorViews
vector<shared_ptr<runtime::Tensor>> parameter_tvs;
for (auto parameter_node : sub_function->get_parameters())
{
if (map_node_to_tensor_view.find(parameter_node) != map_node_to_tensor_view.end())
{
parameter_tvs.push_back(map_node_to_tensor_view.at(parameter_node));
}
else
{
auto result_node = instance.m_map_parameter_to_result.at(parameter_node);
auto result_tv = map_node_to_tensor_view.at(result_node);
auto parameter_tv = backend->create_tensor(parameter_node->get_element_type(),
parameter_node->get_shape());
copy_data(parameter_tv, read_vector<float>(result_tv));
map_node_to_tensor_view[parameter_node] = parameter_tv;
parameter_tvs.push_back(parameter_tv);
}
}
// Prepare result TensorViews
vector<shared_ptr<runtime::Tensor>> result_tvs;
for (auto result_node : sub_function->get_results())
{
if (map_node_to_tensor_view.find(result_node) != map_node_to_tensor_view.end())
{
result_tvs.push_back(map_node_to_tensor_view.at(result_node));
}
else
{
auto result_tv = backend->create_tensor(result_node->get_element_type(),
result_node->get_shape());
map_node_to_tensor_view[result_node] = result_tv;
result_tvs.push_back(result_tv);
}
}
// Call
backend->call_with_validate(sub_function, result_tvs, parameter_tvs);
}
return rc;
}
protected:
class FunctionInstance
{
public:
shared_ptr<Function> m_function;
vector<shared_ptr<Function>> m_sub_functions;
unordered_map<shared_ptr<op::Parameter>, shared_ptr<op::Result>> m_map_parameter_to_result;
};
shared_ptr<runtime::Backend> get_cached_backend(Placement placement)
{
if (m_cached_backends.find(placement) == m_cached_backends.end())
{
m_cached_backends[placement] = runtime::Backend::create(placement_to_string(placement));
}
return m_cached_backends.at(placement);
}
map<Placement, shared_ptr<runtime::Backend>> m_cached_backends;
map<shared_ptr<Function>, FunctionInstance> m_function_map;
function<Placement(shared_ptr<Node>)> m_placement_policy;
};
TEST(graph_partition, placement_all_cpu_policy)
{
Shape shape = Shape{2, 2};
shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<Node> AplusB = A + B;
shared_ptr<Node> AplusBtimesC = AplusB * C;
shared_ptr<Function> f = make_shared<Function>(AplusBtimesC, ParameterVector{A, B, C});
for (auto node : f->get_ordered_ops())
{
EXPECT_EQ(node->get_placement(), Placement::DEFAULT);
}
pass::Manager pass_manager;
pass_manager.register_pass<pass::AssignPlacement>(
[](shared_ptr<Node> node) { return Placement::CPU; });
pass_manager.run_passes(f);
for (auto node : f->get_ordered_ops())
{
EXPECT_EQ(node->get_placement(), Placement::CPU);
}
}
#ifdef NGRAPH_CPU_ENABLE
TEST(graph_partition, placement_int_with_cpu_mul_policy)
{
Shape shape = Shape{2, 2};
shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<Node> AplusB = A + B;
shared_ptr<Node> AplusBtimesC = AplusB * C;
shared_ptr<Function> f = make_shared<Function>(AplusBtimesC, ParameterVector{A, B, C});
for (auto node : f->get_ordered_ops())
{
EXPECT_EQ(node->get_placement(), Placement::DEFAULT);
}
pass::Manager pass_manager;
pass_manager.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
pass_manager.run_passes(f);
for (auto node : f->get_ordered_ops())
{
string node_op = node->description();
if (node_op == "Multiply")
{
EXPECT_EQ(node->get_placement(), Placement::CPU);
}
else
{
EXPECT_EQ(node->get_placement(), Placement::INTERPRETER);
}
}
}
TEST(graph_partition, hybrid_abc_manual)
{
// A B C A B C
// \ / / \ / /
// +D / +D /
// \ / | /
// *E R0 R1 f0(INT)
// | ------------------
// R P0 P1
// \ /
// *E
// |
// R2 f1(CPU)
// ------------------
// P2
// |
// R f2(INT)
// ------------------
Shape shape = Shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto C = make_shared<op::Parameter>(element::f32, shape);
auto D = A + B;
auto E = D * C;
auto R = make_shared<op::Result>(E);
auto f = make_shared<Function>(ResultVector{R}, ParameterVector{A, B, C});
pass::Manager pass_manager;
pass_manager.register_pass<pass::AssignPlacement>(int_with_cpu_mul_policy);
pass_manager.run_passes(f);
// Insert parameter
auto RP0 = insert_result_parameter_split(D, E);
shared_ptr<op::Result> R0 = RP0.first;
shared_ptr<op::Parameter> P0 = RP0.second;
auto RP1 = insert_result_parameter_split(C, E);
shared_ptr<op::Result> R1 = RP1.first;
shared_ptr<op::Parameter> P1 = RP1.second;
auto RP2 = insert_result_parameter_split(E, R);
shared_ptr<op::Result> R2 = RP2.first;
shared_ptr<op::Parameter> P2 = RP2.second;
// Backends
auto int_backend = runtime::Backend::create(placement_to_string(Placement::INTERPRETER));
auto cpu_backend = runtime::Backend::create(placement_to_string(Placement::CPU));
// f0 on INT
auto a = int_backend->create_tensor(element::f32, shape);
auto b = int_backend->create_tensor(element::f32, shape);
auto c = int_backend->create_tensor(element::f32, shape);
auto r0 = int_backend->create_tensor(element::f32, shape);
auto r1 = int_backend->create_tensor(element::f32, shape);
copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
auto f0 = make_shared<Function>(ResultVector{R0, R1}, ParameterVector{A, B, C});
int_backend->compile(f0);
int_backend->call_with_validate(f0, {r0, r1}, {a, b, c});
// f1 on CPU
auto p0 = cpu_backend->create_tensor(element::f32, shape);
auto p1 = cpu_backend->create_tensor(element::f32, shape);
auto r2 = cpu_backend->create_tensor(element::f32, shape);
copy_data(p0, read_vector<float>(r0));
copy_data(p1, read_vector<float>(r1));
auto f1 = make_shared<Function>(ResultVector{R2}, ParameterVector{P0, P1});
cpu_backend->compile(f1);
cpu_backend->call_with_validate(f1, {r2}, {p0, p1});
// f2 on INT
auto p2 = int_backend->create_tensor(element::f32, shape);
auto r = int_backend->create_tensor(element::f32, shape);
copy_data(p2, read_vector<float>(r2));
auto f2 = make_shared<Function>(ResultVector{R}, ParameterVector{P2});
int_backend->compile(f2);
int_backend->call_with_validate(f2, {r}, {p2});
// Check final result on INT
EXPECT_EQ(read_vector<float>(r),
(test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
}
TEST(graph_partition, hybrid_abc)
{
// Same as hybrid_abc_manual, but using the test hybrid transformer
//
// A B C A B C
// \ / / \ / /
// +D / +D /
// \ / | /
// *E R0 R1 f0(INT)
// | ------------------
// R P0 P1
// \ /
// *E
// |
// R2 f1(CPU)
// ------------------
// P2
// |
// R f2(INT)
// ------------------
Shape shape = Shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto C = make_shared<op::Parameter>(element::f32, shape);
auto D = A + B;
auto E = D * C;
auto R = make_shared<op::Result>(E);
auto f = make_shared<Function>(ResultVector{R}, ParameterVector{A, B, C});
auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> r = backend->create_tensor(element::f32, shape);
copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
backend->call_with_validate(f, {r}, {a, b, c});
EXPECT_EQ(read_vector<float>(r),
(test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
}
TEST(graph_partition, hybrid_abcd)
{
// A B
// \ /
// C E* D
// \ / \ /
// F+ G+
// \ /
// H+
Shape shape = Shape{2, 2};
shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> D = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<Node> E = A * B;
shared_ptr<Node> F = C + E;
shared_ptr<Node> G = E + D;
shared_ptr<Node> H = F + G;
shared_ptr<Function> f = make_shared<Function>(H, ParameterVector{A, B, C, D});
auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
backend->compile(f);
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> d = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> r = backend->create_tensor(element::f32, shape);
copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
copy_data(d, test::NDArray<float, 2>({{13, 14}, {15, 16}}).get_vector());
backend->call_with_validate(f, {r}, {a, b, c, d});
EXPECT_EQ(read_vector<float>(r), (test::NDArray<float, 2>({{32, 48}, {68, 92}})).get_vector());
}
TEST(graph_partition, hybrid_back_and_forth)
{
// A B
// \ / \
// D* |
// \ /
// E+ C
// \ /
// F*
Shape shape = Shape{2, 2};
shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<Node> D = A * B;
shared_ptr<Node> E = D + B;
shared_ptr<Node> F = E * C;
shared_ptr<Function> f = make_shared<Function>(F, ParameterVector{A, B, C});
auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
backend->compile(f);
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> r = backend->create_tensor(element::f32, shape);
copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
backend->call_with_validate(f, {r}, {a, b, c});
EXPECT_EQ(read_vector<float>(r),
(test::NDArray<float, 2>({{90, 180}, {308, 480}})).get_vector());
}
TEST(graph_partition, hybrid_multi_middle_nodes)
{
// A B C
// \ / \ / \
// D+ E+ |
// \ / \ /
// F* G*
// \ /
// H+
Shape shape = Shape{2, 2};
shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> C = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<Node> D = A + B;
shared_ptr<Node> E = B + C;
shared_ptr<Node> F = D * E;
shared_ptr<Node> G = E * C;
shared_ptr<Node> H = F + G;
shared_ptr<Function> f = make_shared<Function>(H, ParameterVector{A, B, C});
auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
backend->compile(f);
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> r = backend->create_tensor(element::f32, shape);
copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
backend->call_with_validate(f, {r}, {a, b, c});
EXPECT_EQ(read_vector<float>(r),
(test::NDArray<float, 2>({{210, 288}, {378, 480}})).get_vector());
}
TEST(graph_partition, hybrid_no_split)
{
// A B
// \ /
// +
Shape shape = Shape{2, 2};
shared_ptr<op::Parameter> A = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<op::Parameter> B = make_shared<op::Parameter>(element::f32, shape);
shared_ptr<Node> C = A + B;
shared_ptr<Function> f = make_shared<Function>(C, ParameterVector{A, B});
auto backend = make_shared<HybridBackend>(int_with_cpu_mul_policy);
backend->compile(f);
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape);
copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
backend->call_with_validate(f, {c}, {a, b});
EXPECT_EQ(read_vector<float>(c), (test::NDArray<float, 2>({{6, 8}, {10, 12}})).get_vector());
}
#endif