Commit 2efc0065 authored by amy.zhuang

Add in place concat optimization.

parent f5d2fa68
......@@ -19,6 +19,7 @@
#include "ngraph/log.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
......@@ -43,20 +44,25 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
if (auto op = std::dynamic_pointer_cast<op::Op>(node))
{
if (auto op_annotations = op->get_op_annotations())
// concat in_place_oi should be treated differently
if (!std::dynamic_pointer_cast<op::Concat>(node))
{
for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
if (auto op_annotations = op->get_op_annotations())
{
auto output = &node->get_outputs().at(oi_pair.output).get_tensor();
auto input = &node->get_inputs().at(oi_pair.input).get_tensor();
auto input_node = node->get_inputs().at(oi_pair.input).get_output().get_node();
// an input tensor can be reused if this is the last use
if (node->liveness_free_list.count(input) != 0 &&
node->liveness_new_list.count(output) != 0)
for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
{
in_place_outputs.insert({output, input});
reused_inputs.insert(input);
auto output = &node->get_outputs().at(oi_pair.output).get_tensor();
auto input = &node->get_inputs().at(oi_pair.input).get_tensor();
auto input_node =
node->get_inputs().at(oi_pair.input).get_output().get_node();
// an input tensor can be reused if this is the last use
if (node->liveness_free_list.count(input) != 0 &&
node->liveness_new_list.count(output) != 0)
{
in_place_outputs.insert({output, input});
reused_inputs.insert(input);
}
}
}
}
......@@ -68,6 +74,27 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
? in_place_outputs.at(tensor)->get_pool_offset()
: mm.allocate(tensor->size());
tensor->set_pool_offset(offset);
// check if the op is concat
if (auto concat = std::dynamic_pointer_cast<op::Concat>(node))
{
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
for (auto arg : concat->get_arguments())
{
auto input_node = std::dynamic_pointer_cast<op::Op>(arg);
auto input_tensor = &input_node->get_output_tensor();
auto old_offset = input_tensor->get_pool_offset();
input_tensor->set_pool_offset(offset);
NGRAPH_DEBUG << "memeory_layout: change offset, old offset is "
<< old_offset << ", new offset is " << offset << std::endl;
offset += input_tensor->size();
}
}
}
}
}
if (!m_disable_memory_sharing)
......
......@@ -106,6 +106,7 @@ set(SRC
pass/cpu_layout.cpp
pass/cpu_loop_kernel_fusion.cpp
pass/cpu_mat_fusion.cpp
pass/cpu_post_layout_assignment.cpp
pass/cpu_post_layout_optimizations.cpp
pass/cpu_rnn_fusion.cpp
pass/cpu_workspace_insertion.cpp
......
......@@ -32,8 +32,9 @@ namespace ngraph
template <>
void Builder::BUILDER_DECL(ngraph::op::Concat)
{
auto axis =
(static_cast<const ngraph::op::Concat*>(node))->get_concatenation_axis();
auto concat = static_cast<const ngraph::op::Concat*>(node);
auto axis = concat->get_concatenation_axis();
auto& functors = external_function->get_functors();
......@@ -48,10 +49,42 @@ namespace ngraph
arg_shapes.emplace_back(arg.get_shape());
}
}
auto nargs = args.size();
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto out_shape = out[0].get_shape();
auto element_size = concat->get_input_element_type(0).size();
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto functor = [&, arg_tensors, nargs, out_shape, arg_shapes, element_size](
CPURuntimeContext* ctx) {
auto out_size = shape_size(out_shape) * element_size;
auto offset = 0;
for (size_t i = 0; i < nargs; i++)
{
auto arg_size = shape_size(arg_shapes[i]) * element_size;
if (arg_tensors[i] < out_tensor ||
arg_tensors[i] >=
reinterpret_cast<char*>(out_tensor) + out_size)
{
memcpy(reinterpret_cast<char*>(out_tensor) + offset,
arg_tensors[i],
arg_size);
}
offset += arg_size;
}
};
functors.emplace_back(functor);
return;
}
}
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
......
......@@ -993,6 +993,30 @@ namespace ngraph
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Concat)
{
auto concat = static_cast<const ngraph::op::Concat*>(node);
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto offset = 0;
for (auto i = 0; i < args.size(); i++)
{
writer << "if (" << args[i].get_name() << " < " << out[0].get_name()
<< " || " << args[i].get_name() << " >= " << out[0].get_name()
<< " + " << out[0].get_size() << ")\n";
writer.block_begin();
writer << "memcpy(" << out[0].get_name() << " + " << offset << ", "
<< args[i].get_name() << ", "
<< args[i].get_size() * out[0].get_element_type().size()
<< ");\n";
writer.block_end();
offset += args[i].get_size();
}
return;
}
}
auto result_shape = out[0].get_shape();
#if USE_EIGEN_CORE_INLINE == 1
......
......@@ -163,6 +163,7 @@
#include "ngraph/runtime/cpu/pass/cpu_horizontal_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
#include "ngraph/runtime/cpu/pass/cpu_mat_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_assignment.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp"
#include "ngraph/runtime/cpu/pass/cpu_rnn_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_workspace_insertion.hpp"
......@@ -395,6 +396,7 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(ngraph::pass::Ma
pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutAssignment>(this);
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.get_state().set_visualize_tree_ops_map(runtime::cpu::get_visualize_tree_ops_map());
}
......@@ -636,6 +638,42 @@ using namespace ngraph::runtime;
}
}
// concat
for (shared_ptr<Node> node : ordered_ops)
{
if (auto concat = std::dynamic_pointer_cast<ngraph::op::Concat>(node))
{
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
bool found_last_concat = true;
for (auto user : concat->get_users())
{
if (dynamic_pointer_cast<ngraph::op::Concat>(user))
{
found_last_concat = false;
break;
}
}
if (found_last_concat)
{
for (auto arg : concat->get_arguments())
{
if (auto arg_concat = dynamic_pointer_cast<ngraph::op::Concat>(arg))
{
NGRAPH_DEBUG << "call propagate_in_place_concat for "
<< arg->get_name() << std::endl;
propagate_in_place_concat(arg_concat);
}
}
}
}
}
}
}
writer << "bool " << current_function->get_name() << "_t_en[" << tensor_index << "];\n";
writer << "extern \"C\" void " << current_function->get_name();
......@@ -1128,6 +1166,42 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output(
} while (propagate_further);
}
// For a chain of in-place concat ops, propagate the pool offset of the
// outermost concat's output tensor down into the output tensors of its
// arguments, so each argument computes directly into its slice of the
// final concat buffer instead of into separate intermediate memory.
void runtime::cpu::CPU_ExternalFunction::propagate_in_place_concat(
    shared_ptr<ngraph::op::Concat> concat)
{
    // Iterative traversal of nested concats (avoids recursion).
    std::deque<std::shared_ptr<ngraph::op::Concat>> stack;
    stack.push_front(concat);

    while (stack.size() > 0)
    {
        auto it = stack.front();
        stack.pop_front();
        if (auto op_annotations = it->get_op_annotations())
        {
            auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
            // Only concats that were marked for in-place execution
            // (by the post-layout assignment pass) participate.
            if (in_place_oi_pairs.size() > 0)
            {
                auto output_tensor = &it->get_output_tensor();
                auto offset = output_tensor->get_pool_offset();
                // Arguments are laid out back-to-back starting at the
                // concat output's own pool offset.
                for (auto arg : it->get_arguments())
                {
                    // NOTE(review): assumes every argument is an op::Op;
                    // the dynamic_pointer_cast result is dereferenced
                    // unchecked — confirm upstream pass guarantees this.
                    auto input_node = std::dynamic_pointer_cast<ngraph::op::Op>(arg);
                    auto input_tensor = &input_node->get_output_tensor();
                    auto old_offset = input_tensor->get_pool_offset();
                    input_tensor->set_pool_offset(offset);
                    NGRAPH_DEBUG << "cpu_external_function: change offset, old offset is "
                                 << old_offset << ", new offset is " << offset << std::endl;
                    offset += input_tensor->size();
                    // An argument that is itself an in-place concat must
                    // forward its freshly assigned offset to its own
                    // arguments, so push it for processing.
                    if (auto arg_concat = std::dynamic_pointer_cast<ngraph::op::Concat>(arg))
                    {
                        stack.push_front(arg_concat);
                    }
                }
            }
        }
    }
}
void runtime::cpu::CPU_ExternalFunction::build()
{
if (m_is_built)
......@@ -1181,6 +1255,43 @@ void runtime::cpu::CPU_ExternalFunction::build()
}
// Build executor
// concat
for (shared_ptr<Node> node : m_function->get_ordered_ops())
{
if (auto concat = std::dynamic_pointer_cast<ngraph::op::Concat>(node))
{
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
bool found_last_concat = true;
for (auto user : concat->get_users())
{
if (dynamic_pointer_cast<ngraph::op::Concat>(user))
{
found_last_concat = false;
break;
}
}
if (found_last_concat)
{
for (auto arg : concat->get_arguments())
{
if (auto arg_concat = dynamic_pointer_cast<ngraph::op::Concat>(arg))
{
NGRAPH_DEBUG << "call propagate_in_place_concat for "
<< arg->get_name() << std::endl;
propagate_in_place_concat(arg_concat);
}
}
}
}
}
}
}
// Intermediates
if (m_function->get_temporary_pool_size())
{
......
......@@ -36,6 +36,7 @@
#endif
#include "ngraph/function.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
......@@ -157,6 +158,8 @@ namespace ngraph
void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
std::string output_name,
bool dex);
// For a chain of concat ops, propagate pool offsets
void propagate_in_place_concat(std::shared_ptr<ngraph::op::Concat> concat);
bool computes_result(Node* node);
#if !defined(NGRAPH_DEX_ONLY)
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/pass/cpu_post_layout_assignment.hpp"
#include <typeindex>
#include <typeinfo>
#include "ngraph/descriptor/output.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace pass
            {
                // Decide whether this Concat can be executed "in place", i.e.
                // its arguments can write directly into disjoint slices of the
                // concat's output buffer. If so, annotate the op with a
                // non-destructive in-place output/input pair {0, 0}.
                template <>
                void CPUPostLayoutAssignment::ASSIGN_DECL(ngraph::op::Concat)
                {
                    auto concat = static_cast<op::Concat*>(node);
                    auto shape = concat->get_input_shape(0);
                    auto axis = concat->get_concatenation_axis();
                    // In-place concat requires each argument to occupy one
                    // contiguous slab of the output. That holds only when all
                    // dimensions before the concat axis have extent 1.
                    // NOTE(review): `int i` vs. (presumably unsigned) `axis`
                    // is a signed/unsigned comparison — confirm axis type.
                    auto product = 1;
                    for (int i = 0; i < axis; i++)
                    {
                        product *= shape[i];
                    }
                    if (product != 1)
                    {
                        NGRAPH_DEBUG << "cpu_post_layout_assignment: The product of Concat's shape "
                                        "before concat axis is not 1, no in place concat";
                        return;
                    }

                    bool in_place_concat = false;
                    // Validate every input; any disqualifying input aborts the
                    // whole optimization (early return, no annotation added).
                    for (descriptor::Input& input : concat->get_inputs())
                    {
                        if (shape_size(input.get_shape()) == 0)
                        {
                            NGRAPH_DEBUG << "cpu_post_layout_assignment: 0 length tensor, no in "
                                            "place concat";
                            return;
                        }
                        const auto& output = input.get_output();
                        auto arg = output.get_node();
                        // Constants/parameters own their storage; their
                        // output tensors cannot be re-pointed into the pool.
                        if (std::dynamic_pointer_cast<op::Constant>(arg) ||
                            std::dynamic_pointer_cast<op::Parameter>(arg))
                        {
                            NGRAPH_DEBUG << "cpu_post_layout_assignment: " << arg->get_name()
                                         << ": constant or parameter, no in place concat";
                            return;
                        }
                        else if (output.get_inputs().size() != 1)
                        {
                            // The argument has other users besides this
                            // concat: check whether sharing is still safe.
                            // Bail if the same output feeds two concats.
                            auto concat_count = 0;
                            for (auto input : output.get_inputs())
                            {
                                auto user = input->get_node();
                                if (std::dynamic_pointer_cast<op::Concat>(user))
                                {
                                    concat_count++;
                                    if (concat_count == 2)
                                    {
                                        NGRAPH_DEBUG << "cpu_post_layout_assignment: multiple "
                                                        "concat users, no in place concat";
                                        return;
                                    }
                                }
                            }
                            // DFS downstream from the argument: the concat
                            // must post-dominate all other uses, and no user
                            // on the way may destructively reuse the tensor.
                            std::unordered_set<Node*> visited;
                            std::deque<Node*> stack;
                            stack.push_front(arg.get());

                            while (stack.size() > 0)
                            {
                                ngraph::Node* curr = stack.front();
                                visited.insert(curr);
                                // Reaching a graph output means the value
                                // escapes; overwriting it would be observable.
                                if (curr->is_output())
                                {
                                    NGRAPH_DEBUG << "cpu_post_layout_assignment: not post "
                                                    "dominated, no in place concat";
                                    return;
                                }
                                else
                                {
                                    if (auto op = dynamic_cast<op::Op*>(curr))
                                    {
                                        if (auto op_annotations = op->get_op_annotations())
                                        {
                                            for (auto oi_pair :
                                                 op_annotations->get_in_place_oi_pairs())
                                            {
                                                // A destructive in-place user
                                                // would clobber the shared
                                                // buffer before the concat
                                                // result is consumed.
                                                if (oi_pair.destructive)
                                                {
                                                    NGRAPH_DEBUG << "cpu_post_layout_assignment: "
                                                                    "destructive in place oi, no "
                                                                    "in place concat";
                                                    return;
                                                }
                                            }
                                        }
                                    }
                                }
                                stack.pop_front();
                                // Do not traverse past the concat itself.
                                if (curr != concat)
                                {
                                    for (auto next : curr->get_users())
                                    {
                                        if (visited.count(next.get()) == 0)
                                        {
                                            stack.push_front(next.get());
                                        }
                                    }
                                }
                            }
                            in_place_concat = true;
                        }
                        else
                        {
                            // Sole user: trivially safe to share the buffer.
                            in_place_concat = true;
                        }
                    }

                    if (in_place_concat)
                    {
                        // Record the decision as a non-destructive {0, 0}
                        // oi-pair, creating annotations if none exist yet.
                        auto op_annotations = concat->get_op_annotations();
                        if (op_annotations)
                        {
                            op_annotations->add_in_place_oi_pair({0, 0, false});
                        }
                        else
                        {
                            op_annotations =
                                std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
                            op_annotations->add_in_place_oi_pair({0, 0, false});
                            concat->set_op_annotations(op_annotations);
                        }
                    }
                }
            }
        }
    }
}
#define TI(x) type_index(typeid(x))

// Dispatch table: concrete op type -> post-layout assignment routine.
// Only Concat currently has a specialized handler; all other ops are
// simply skipped by run_on_call_graph.
static const runtime::cpu::pass::PostLayoutAssignOpMap s_dispatcher{
    {TI(ngraph::op::Concat),
     &runtime::cpu::pass::CPUPostLayoutAssignment::assign<ngraph::op::Concat>},
};
bool runtime::cpu::pass::CPUPostLayoutAssignment::run_on_call_graph(
const std::list<std::shared_ptr<Node>>& nodes)
{
for (const auto& node : nodes)
{
auto& n = *node;
auto handler = s_dispatcher.find(TI(n));
if (handler != s_dispatcher.end())
{
handler->second(m_external_function, node.get());
}
}
return false;
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#define ASSIGN_DECL(op_name) \
assign<op_name>(ngraph::runtime::cpu::CPU_ExternalFunction * external_function, \
ngraph::Node * node)
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace pass
            {
                /// Signature of a per-op post-layout assignment routine.
                using PostLayoutAssignFunction =
                    std::function<void(CPU_ExternalFunction*, ngraph::Node*)>;
                /// Dispatch table keyed by the concrete op's type_index.
                using PostLayoutAssignOpMap =
                    std::unordered_map<std::type_index, PostLayoutAssignFunction>;

                /// Pass run after layout assignment. Annotates ops (currently
                /// only Concat) with in-place output/input pairs so that later
                /// memory-layout passes can make a concat's arguments write
                /// directly into slices of the concat's output buffer.
                class CPUPostLayoutAssignment : public ngraph::pass::CallGraphPass
                {
                public:
                    /// \param external_function Owning external function;
                    /// stored as a non-owning pointer and forwarded to each
                    /// dispatched assignment routine.
                    /// `explicit` prevents a raw CPU_ExternalFunction* from
                    /// silently converting into a pass instance.
                    explicit CPUPostLayoutAssignment(CPU_ExternalFunction* external_function)
                        : m_external_function(external_function)
                    {
                    }

                    virtual bool
                        run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;

                    /// Fallback for ops without a specialized routine; only
                    /// reachable if an op is registered in the dispatch table
                    /// without a matching template specialization.
                    template <typename OP>
                    static void
                        assign(ngraph::runtime::cpu::CPU_ExternalFunction* external_function,
                               ngraph::Node* node)
                    {
                        throw std::runtime_error("Unimplemented op '" + node->description() +
                                                 "' in CPU post layout assignment");
                    }

                private:
                    CPU_ExternalFunction* m_external_function; // non-owning
                };
            }
        }
    }
}
......@@ -641,6 +641,70 @@ NGRAPH_TEST(${BACKEND_NAME}, concat_2d_tensor)
EXPECT_EQ((vector<float>{1, 2, 3}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_2d_tensor)
{
    // Build concat(A+B, C+D, C-A) over 1x1 tensors. With inputs 1,2,3,4
    // the expected rows are {3, 7, 2}; exercises the in-place concat
    // optimization on the CPU backend.
    Shape shape{1, 1};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto D = make_shared<op::Parameter>(element::f32, shape);
    auto add1 = make_shared<op::Add>(A, B);
    auto add2 = make_shared<op::Add>(C, D);
    auto subtract = make_shared<op::Subtract>(C, A);
    Shape shape_r{3, 1};
    auto concat = make_shared<op::Concat>(NodeVector{add1, add2, subtract}, 0);
    auto f = make_shared<Function>(concat, op::ParameterVector{A, B, C, D});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Input tensors initialized to 1..4, one per parameter.
    auto t_a = backend->create_tensor(element::f32, shape);
    copy_data(t_a, vector<float>{1});
    auto t_b = backend->create_tensor(element::f32, shape);
    copy_data(t_b, vector<float>{2});
    auto t_c = backend->create_tensor(element::f32, shape);
    copy_data(t_c, vector<float>{3});
    auto t_d = backend->create_tensor(element::f32, shape);
    copy_data(t_d, vector<float>{4});
    auto result = backend->create_tensor(element::f32, shape_r);

    backend->call_with_validate(f, {result}, {t_a, t_b, t_c, t_d});
    EXPECT_EQ((vector<float>{3, 7, 2}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_propagate_2d_tensor)
{
    // Nested concat: concat(concat(A+B, C+D), C-A). With inputs 1,2,3,4
    // the expected result is {3, 7, 2}. The inner concat feeding the outer
    // one exercises offset propagation through a chain of in-place concats
    // (propagate_in_place_concat on the CPU backend).
    Shape shape{1, 1};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto add1 = make_shared<op::Add>(A, B);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto D = make_shared<op::Parameter>(element::f32, shape);
    auto add2 = make_shared<op::Add>(C, D);
    auto concat1 = make_shared<op::Concat>(NodeVector{add1, add2}, 0);
    auto subtract = make_shared<op::Subtract>(C, A);
    Shape shape_r{3, 1};
    auto f = make_shared<Function>(make_shared<op::Concat>(NodeVector{concat1, subtract}, 0),
                                   op::ParameterVector{A, B, C, D});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, shape);
    copy_data(a, vector<float>{1});
    auto b = backend->create_tensor(element::f32, shape);
    copy_data(b, vector<float>{2});
    auto c = backend->create_tensor(element::f32, shape);
    copy_data(c, vector<float>{3});
    auto d = backend->create_tensor(element::f32, shape);
    copy_data(d, vector<float>{4});
    auto result = backend->create_tensor(element::f32, shape_r);

    backend->call_with_validate(f, {result}, {a, b, c, d});
    EXPECT_EQ((vector<float>{3, 7, 2}), read_vector<float>(result));
}
// from numpy import *
// a=linspace(1,2*3*4*3*2,2*3*4*3*2)
// b=linspace(1000+1,1000+2*3*3*3*2,2*3*3*3*2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment