Unverified commit fdab16db authored by Adam Procter, committed by GitHub

Codegen for >2D concat following ref kernel pattern (#296)

parent c89b1a84
@@ -151,6 +151,8 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
    runtime/cpu/cpu_backend.cpp
    runtime/cpu/cpu_manager.cpp
    runtime/cpu/cpu_kernels.cpp
+   runtime/cpu/cpu_kernel_emitters.cpp
+   runtime/cpu/cpu_kernel_utils.cpp
    runtime/cpu/cpu_emitter.cpp
    runtime/cpu/cpu_external_function.cpp
    runtime/cpu/cpu_tensor_view.cpp
...
@@ -20,6 +20,7 @@ using namespace ngraph;
codegen::CodeWriter::CodeWriter()
    : indent(0)
    , m_pending_indent(true)
+   , m_temporary_name_count(0)
{
}

@@ -32,3 +33,13 @@ void codegen::CodeWriter::operator+=(const std::string& s)
{
    *this << s;
}
+std::string codegen::CodeWriter::generate_temporary_name(std::string prefix)
+{
+    std::stringstream ss;
+    ss << prefix << "__" << m_temporary_name_count;
+    m_temporary_name_count++;
+    return ss.str();
+}
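A minimal usage sketch (the writer and loop here are illustrative, not part of this commit): each call returns a fresh name of the form prefix__N, so independently emitted helpers never collide on variable names.

    codegen::CodeWriter writer;
    std::string i0 = writer.generate_temporary_name("i"); // "i__0"
    std::string i1 = writer.generate_temporary_name("i"); // "i__1"
    writer << "for (size_t " << i0 << " = 0; " << i0 << " < 4; " << i0 << "++)\n";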
@@ -66,7 +66,10 @@ public:
        return out;
    }
+   std::string generate_temporary_name(std::string prefix = "tempvar");
private:
    std::stringstream m_ss;
    bool m_pending_indent;
+   size_t m_temporary_name_count;
};
@@ -30,18 +30,18 @@ inline T ceil_div(T x, T y)
    return (x == 0 ? 0 : (1 + (x - 1) / y));
}
-CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
+CoordinateTransform::CoordinateTransform(const Shape& source_shape,
                                          const Coordinate& source_start_corner,
                                          const Coordinate& source_end_corner,
                                          const Strides& source_strides,
                                          const AxisVector& source_axis_order)
-    : m_source_space_shape(source_space_shape)
+    : m_source_shape(source_shape)
     , m_source_start_corner(source_start_corner)
     , m_source_end_corner(source_end_corner)
     , m_source_strides(source_strides)
     , m_source_axis_order(source_axis_order)
{
-    m_n_axes = source_space_shape.size();
+    m_n_axes = source_shape.size();
    if (m_n_axes != source_start_corner.size())
    {

@@ -80,8 +80,8 @@ CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
    for (size_t i = 0; i < m_n_axes; i++)
    {
-        if (source_start_corner[i] >= source_space_shape[i] &&
-            !(source_start_corner[i] == 0 && source_space_shape[i] == 0))
+        if (source_start_corner[i] >= source_shape[i] &&
+            !(source_start_corner[i] == 0 && source_shape[i] == 0))
        {
            std::stringstream ss;

@@ -92,7 +92,7 @@ CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
    for (size_t i = 0; i < m_n_axes; i++)
    {
-        if (source_end_corner[i] > source_space_shape[i])
+        if (source_end_corner[i] > source_shape[i])
        {
            std::stringstream ss;

@@ -120,7 +120,7 @@ CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
    }
}
-AxisVector default_axis_order(size_t n_axes)
+static AxisVector default_axis_order(size_t n_axes)
{
    AxisVector result(n_axes);
    size_t n = 0;

@@ -129,50 +129,50 @@ AxisVector default_axis_order(size_t n_axes)
    return result;
}
-CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
+CoordinateTransform::CoordinateTransform(const Shape& source_shape,
                                          const Coordinate& source_start_corner,
                                          const Coordinate& source_end_corner,
                                          const Strides& source_strides)
-    : CoordinateTransform(source_space_shape,
+    : CoordinateTransform(source_shape,
                           source_start_corner,
                           source_end_corner,
                           source_strides,
-                          default_axis_order(source_space_shape.size()))
+                          default_axis_order(source_shape.size()))
{
}
-Strides default_source_strides(size_t n_axes)
+static Strides default_source_strides(size_t n_axes)
{
    return AxisVector(n_axes, 1);
}
-CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
+CoordinateTransform::CoordinateTransform(const Shape& source_shape,
                                          const Coordinate& source_start_corner,
                                          const Coordinate& source_end_corner)
-    : CoordinateTransform(source_space_shape,
+    : CoordinateTransform(source_shape,
                           source_start_corner,
                           source_end_corner,
-                          default_source_strides(source_space_shape.size()),
-                          default_axis_order(source_space_shape.size()))
+                          default_source_strides(source_shape.size()),
+                          default_axis_order(source_shape.size()))
{
}
-Coordinate default_source_start_corner(size_t n_axes)
+static Coordinate default_source_start_corner(size_t n_axes)
{
    return Coordinate(n_axes, 0);
}
-Coordinate default_source_end_corner(const Shape& source_space_shape)
+static Coordinate default_source_end_corner(const Shape& source_shape)
{
-    return source_space_shape;
+    return source_shape;
}
-CoordinateTransform::CoordinateTransform(const Shape& source_space_shape)
-    : CoordinateTransform(source_space_shape,
-                          default_source_start_corner(source_space_shape.size()),
-                          default_source_end_corner(source_space_shape),
-                          default_source_strides(source_space_shape.size()),
-                          default_axis_order(source_space_shape.size()))
+CoordinateTransform::CoordinateTransform(const Shape& source_shape)
+    : CoordinateTransform(source_shape,
+                          default_source_start_corner(source_shape.size()),
+                          default_source_end_corner(source_shape),
+                          default_source_strides(source_shape.size()),
+                          default_axis_order(source_shape.size()))
{
}

@@ -185,7 +185,7 @@ size_t CoordinateTransform::index_source(const Coordinate& c) const
    for (size_t axis = m_n_axes; axis-- > 0;)
    {
        index += c[axis] * stride;
-        stride *= m_source_space_shape[axis];
+        stride *= m_source_shape[axis];
    }
    return index;
...
@@ -41,12 +41,17 @@ namespace ngraph
                const Coordinate& source_start_corner,
                const Coordinate& source_end_corner);
-            CoordinateTransform(const Shape& source_space_shape);
+            CoordinateTransform(const Shape& source_shape);
            size_t index(const Coordinate& c) const;
            bool in_bounds(const Coordinate& c) const;
            Coordinate get_target_shape() const;
+            Shape get_source_shape() { return m_source_shape; }
+            Coordinate get_source_start_corner() { return m_source_start_corner; }
+            Coordinate get_source_end_corner() { return m_source_end_corner; }
+            Strides get_source_strides() { return m_source_strides; }
+            AxisVector get_source_axis_order() { return m_source_axis_order; }
            class Iterator
            {
            public:

@@ -73,7 +78,7 @@ namespace ngraph
            Coordinate to_source_coordinate(const Coordinate& c) const;
            size_t index_source(const Coordinate& c) const;
-            Shape m_source_space_shape;
+            Shape m_source_shape;
            Shape m_source_start_corner;
            Shape m_source_end_corner;
            Strides m_source_strides;
...
@@ -34,6 +34,7 @@
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/runtime/cpu/cpu_emitter.hpp"
+#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/util.hpp"
using namespace std;

@@ -250,6 +251,49 @@ void runtime::cpu::CPU_Emitter::EmitConcat(const ngraph::Node* n,
        m_out.indent--;
        m_out << "}\n";
    }
+    else
+    {
+        if (m_use_ref_kernels)
+        {
+            auto axis = (dynamic_cast<const op::Concat*>(n))->get_concatenation_axis();
+            std::vector<std::string> arg_names;
+            std::vector<std::string> arg_shape_strings;
+            for (auto arg : args)
+            {
+                arg_names.push_back(arg.get_name());
+                arg_shape_strings.push_back("{" + join(arg.get_shape()) + "}");
+            }
+            m_out << "kernel::concat<" << out[0].get_type() << ">({" << join(arg_names) << "},\n";
+            m_out << " " << out[0].get_name() << ",\n";
+            m_out << " {" << join(arg_shape_strings) << "},\n";
+            m_out << " {" << join(result_shape) << "},\n";
+            m_out << " " << axis << ");\n";
+        }
+        else
+        {
+            auto axis = (dynamic_cast<const op::Concat*>(n))->get_concatenation_axis();
+            std::vector<std::string> arg_names;
+            std::vector<Shape> arg_shapes;
+            for (auto arg : args)
+            {
+                arg_names.push_back(arg.get_name());
+                arg_shapes.push_back(arg.get_shape());
+            }
+            kernels::emit_concat(m_out,
+                                 args[0].get_element_type().c_type_string(),
+                                 arg_names,
+                                 out[0].get_name(),
+                                 arg_shapes,
+                                 result_shape,
+                                 axis);
+        }
+    }
}
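For concreteness (tensor names and shapes here are illustrative): for two float arguments TV_0 and TV_1 of shape {2, 2, 2} concatenated along axis 1 into TV_2, the ref-kernel branch above would emit a call of roughly this form:

    kernel::concat<float>({TV_0, TV_1},
     TV_2,
     {{2, 2, 2}, {2, 2, 2}},
     {2, 4, 2},
     1);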
void runtime::cpu::CPU_Emitter::EmitDivide(const ngraph::Node* n,
...
@@ -37,10 +37,12 @@ namespace ngraph
            {
            protected:
                codegen::CodeWriter m_out;
+                bool m_use_ref_kernels;
            public:
                CPU_Emitter()
                    : m_out()
+                    , m_use_ref_kernels(std::getenv("NGRAPH_CPU_USE_REF_KERNELS") != nullptr)
                {
                }
                std::string get_code() { return m_out.get_code(); }
...
@@ -207,6 +207,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
#include "ngraph/runtime/cpu/cpu_eigen_utils.hpp"
#include "ngraph/runtime/cpu/cpu_kernels.hpp"
#include "ngraph/runtime/kernel/broadcast.hpp"
+#include "ngraph/runtime/kernel/concat.hpp"
#include "ngraph/runtime/kernel/dot.hpp"
#include "ngraph/runtime/kernel/one_hot.hpp"
#include "ngraph/runtime/kernel/reduce.hpp"
...
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/cpu/cpu_kernel_utils.hpp"
using namespace ngraph;
using namespace ngraph::runtime::cpu::kernels;
//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void ngraph::runtime::cpu::kernels::emit_concat(codegen::CodeWriter& writer,
                                                std::string element_type,
                                                const std::vector<std::string> args,
                                                std::string out,
                                                const std::vector<Shape>& in_shapes,
                                                const Shape& out_shape,
                                                size_t concatenation_axis)
{
    size_t concatenation_pos = 0;
    for (size_t i = 0; i < args.size(); i++)
    {
        Coordinate out_start_coord = Coordinate(out_shape.size(), 0);
        out_start_coord[concatenation_axis] = concatenation_pos;
        Coordinate out_end_coord = out_shape;
        out_end_coord[concatenation_axis] = concatenation_pos + in_shapes[i][concatenation_axis];
        CoordinateTransform input_transform(in_shapes[i]);
        CoordinateTransform output_chunk_transform(out_shape, out_start_coord, out_end_coord);
        emit_pointwise_copy(
            writer, element_type, args[i], out, input_transform, output_chunk_transform);
        concatenation_pos += in_shapes[i][concatenation_axis];
    }
}
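To make the codegen path concrete, here is roughly what emit_concat writes for two float inputs of shape {2, 3} (buffer names arg0, arg1, and out are hypothetical) concatenated along axis 1 into a {2, 6} output: one pointwise copy per input, each targeting its chunk of the output, with the temporary names drawn from CodeWriter::generate_temporary_name.

    float(&source_nd__0)[2][3] = *reinterpret_cast<float(*)[2][3]>(arg0);
    float(&dest_nd__1)[2][6] = *reinterpret_cast<float(*)[2][6]>(out);
    #pragma omp parallel for
    for(size_t i__2 = 0; i__2 < 2; i__2++)
    {
        for(size_t i__3 = 0; i__3 < 3; i__3++)
        {
            dest_nd__1[i__2][i__3] = source_nd__0[i__2][i__3];
        }
    }
    // ...then the same pattern for arg1, writing
    // dest_nd__5[i__6][((i__7) + 3)] = source_nd__4[i__6][i__7];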
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#pragma once
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernels
            {
                void emit_concat(codegen::CodeWriter& writer,
                                 std::string element_type,
                                 const std::vector<std::string> args,
                                 std::string out,
                                 const std::vector<Shape>& in_shapes,
                                 const Shape& out_shape,
                                 size_t concatenation_axis);
            }
        }
    }
}
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#include "ngraph/runtime/cpu/cpu_kernel_utils.hpp"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
#include "ngraph/coordinate_transform.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
using namespace ngraph::runtime::cpu::kernels;
//
// Given a coordinate transform and a vector of index expressions relative to
// the target coordinate space, produces the strings needed to index into the
// source coordinate space if it is represented as a multidimensional array.
//
// For example, if
//
//    trans has strides (2,2,2), axis order (2,0,1), and start offsets (3,4,5)
//
//    index_vars are "i", "j", "k"
//
// this will produce (one string per source axis):
//
//    {"(2 * (k) + 3)", "(2 * (i) + 4)", "(2 * (j) + 5)"}
//
std::vector<std::string>
    ngraph::runtime::cpu::kernels::emit_multi_indices(CoordinateTransform trans,
                                                      std::vector<std::string> index_vars)
{
    std::vector<std::string> result;
    for (size_t i = 0; i < index_vars.size(); i++)
    {
        std::string index_var = index_vars[trans.get_source_axis_order()[i]];
        size_t source_stride = trans.get_source_strides()[i];
        size_t source_start = trans.get_source_start_corner()[i];
        std::stringstream ss;
        if (source_stride == 1 && source_start == 0)
        {
            ss << index_var;
        }
        else if (source_stride == 1)
        {
            ss << "((" << index_var << ") + " << source_start << ")";
        }
        else if (source_start == 0)
        {
            ss << "(" << source_stride << " * (" << index_var << "))";
        }
        else
        {
            ss << "(" << source_stride << " * (" << index_var << ") + " << source_start << ")";
        }
        result.push_back(ss.str());
    }
    return result;
}
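As a quick check of the example in the comment above (the shape {8, 8, 8} is an arbitrary stand-in large enough for the start offsets):

    CoordinateTransform trans(Shape{8, 8, 8},
                              Coordinate{3, 4, 5},
                              Coordinate{8, 8, 8},
                              Strides{2, 2, 2},
                              AxisVector{2, 0, 1});
    auto indices = emit_multi_indices(trans, {"i", "j", "k"});
    // indices == {"(2 * (k) + 3)", "(2 * (i) + 4)", "(2 * (j) + 5)"}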
//
// Given a coordinate transform and a vector of index expressions relative to
// the target coordinate space, produces the string needed to index into the
// source coordinate space if it is flattened into a one-dimensional array.
//
// For example, if
//
//    trans has source shape (2,2,2), strides (2,2,2), axis order (2,0,1),
//    and start offsets (3,4,5)
//
//    index_vars are "i", "j", "k"
//
// this will produce:
//
//    "((4 * (2 * (k) + 3)) + (2 * (2 * (i) + 4)) + (2 * (j) + 5))"
//
std::string ngraph::runtime::cpu::kernels::emit_linear_index(CoordinateTransform trans,
                                                             std::vector<std::string> index_vars)
{
    std::vector<std::string> multi_indices = emit_multi_indices(trans, index_vars);
    size_t stride = 1;
    for (size_t i = index_vars.size(); i-- > 0;)
    {
        // No need to do this (multiply by stride) if it's 1, though it wouldn't hurt anything.
        if (stride != 1)
        {
            std::stringstream ss;
            ss << "(" << stride << " * " << multi_indices[i] << ")";
            multi_indices[i] = ss.str();
        }
        stride *= trans.get_source_shape()[i];
    }
    std::stringstream ss;
    ss << "(" << join(multi_indices, " + ") << ")";
    return ss.str();
}
//
// Begins an indexing loop (just a for-loop) with index_var as the index
// variable, starting at start, continuing while [index_var] < [end].
//
// Optionally emits an OpenMP parallel pragma, if "omp" is true.
//
std::string ngraph::runtime::cpu::kernels::start_index_loop(std::string index_var,
                                                            size_t start,
                                                            size_t end,
                                                            bool omp)
{
    std::stringstream ss;
    if (omp)
    {
        ss << "#pragma omp parallel for\n";
    }
    ss << "for(size_t " << index_var << " = " << start << "; " << index_var << " < " << end << "; "
       << index_var << "++)\n"
       << "{\n";
    return ss.str();
}
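For example, start_index_loop("i__0", 0, 32, true) returns the text:

    #pragma omp parallel for
    for(size_t i__0 = 0; i__0 < 32; i__0++)
    {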
//
// Ends an indexing loop on the index variable [index_var].
//
std::string ngraph::runtime::cpu::kernels::end_index_loop(std::string index_var)
{
    std::stringstream ss;
    ss << "} // end for(" << index_var << ")\n";
    return ss.str();
}
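//
// Emits the bracketed dimension list for declaring (or casting to) a
// multidimensional array over the transform's source shape; e.g. a source
// shape of (2,4,2) produces "[2][4][2]".
//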
std::string ngraph::runtime::cpu::kernels::emit_nd_sizes(CoordinateTransform trans)
{
    std::stringstream ss;
    for (size_t s : trans.get_source_shape())
    {
        ss << "[" << s << "]";
    }
    return ss.str();
}
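//
// Emits the bracketed subscript list for indexing into a multidimensional
// array, one "[...]" per emit_multi_indices entry; e.g. with unit strides and
// zero start offsets, index_vars "i", "j" produce "[i][j]".
//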
std::string ngraph::runtime::cpu::kernels::emit_nd_index(CoordinateTransform trans,
                                                         std::vector<std::string> index_vars)
{
    std::stringstream ss;
    for (std::string index : emit_multi_indices(trans, index_vars))
    {
        ss << "[" << index << "]";
    }
    return ss.str();
}
//
// Emits a pointwise copy from source_buffer, mediated by source_trans, to
// dest_buffer, mediated by dest_trans.
//
void ngraph::runtime::cpu::kernels::emit_pointwise_copy(codegen::CodeWriter& writer,
                                                        std::string element_type,
                                                        std::string source_buffer,
                                                        std::string dest_buffer,
                                                        CoordinateTransform source_trans,
                                                        CoordinateTransform dest_trans)
{
    std::vector<std::string> index_vars;
    Shape source_start_corner = source_trans.get_source_start_corner();
    Shape source_end_corner = source_trans.get_source_end_corner();
    size_t n_axes = source_start_corner.size();
    std::string source_nd_name = writer.generate_temporary_name("source_nd");
    std::string dest_nd_name = writer.generate_temporary_name("dest_nd");
    // Bind multidimensional array references over the flat source and
    // destination buffers, so the copy below can use plain subscripting.
    writer << element_type << "(&" << source_nd_name << ")" << emit_nd_sizes(source_trans)
           << " = *reinterpret_cast<" << element_type << "(*)" << emit_nd_sizes(source_trans)
           << ">(" << source_buffer << ");\n";
    writer << element_type << "(&" << dest_nd_name << ")" << emit_nd_sizes(dest_trans)
           << " = *reinterpret_cast<" << element_type << "(*)" << emit_nd_sizes(dest_trans) << ">("
           << dest_buffer << ");\n";
    for (size_t i = 0; i < n_axes; i++)
    {
        std::string index_var = writer.generate_temporary_name("i");
        // Only the outermost loop (i == 0) gets the OpenMP pragma.
        writer << start_index_loop(index_var, source_start_corner[i], source_end_corner[i], i == 0);
        writer.indent++;
        index_vars.push_back(index_var);
    }
    writer << dest_nd_name << emit_nd_index(dest_trans, index_vars) << " = " << source_nd_name
           << emit_nd_index(source_trans, index_vars) << ";\n";
    for (size_t i = n_axes; i-- > 0;)
    {
        writer.indent--;
        writer << end_index_loop(index_vars[i]);
    }
}
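A usage sketch (buffer names and shapes hypothetical): to copy a whole 2x3 source into the middle of a 4x6 destination at offset (1, 2), the source transform is an identity over the input shape while the destination transform carries the start offsets, exactly as emit_concat arranges above.

    codegen::CodeWriter writer;
    CoordinateTransform input_transform(Shape{2, 3});
    CoordinateTransform output_chunk_transform(
        Shape{4, 6}, Coordinate{1, 2}, Coordinate{3, 5});
    emit_pointwise_copy(
        writer, "float", "src_buf", "dst_buf", input_transform, output_chunk_transform);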
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#pragma once
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
#include "ngraph/coordinate_transform.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernels
            {
                std::vector<std::string> emit_multi_indices(CoordinateTransform trans,
                                                            std::vector<std::string> index_vars);
                std::string emit_linear_index(CoordinateTransform trans,
                                              std::vector<std::string> index_vars);
                std::string
                    start_index_loop(std::string index_var, size_t start, size_t end, bool omp);
                std::string end_index_loop(std::string index_var);
                std::string emit_nd_sizes(CoordinateTransform trans);
                std::string emit_nd_index(CoordinateTransform trans,
                                          std::vector<std::string> index_vars);
                void emit_pointwise_copy(codegen::CodeWriter& writer,
                                         std::string element_type,
                                         std::string source_buffer,
                                         std::string dest_buffer,
                                         CoordinateTransform source_trans,
                                         CoordinateTransform dest_trans);
            }
        }
    }
}
@@ -22,7 +22,6 @@ include_directories(
)
set (SRC
-    backend_performance.cpp
    builder.cpp
    builder_autobroadcast.cpp
    build_graph.cpp

@@ -69,7 +68,7 @@ endif()
if(NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR)
    include_directories(SYSTEM ${LLVM_INCLUDE_DIR})
    link_directories(${LLVM_LIB_DIR})
-    set(SRC ${SRC} codegen.cpp)
+    set(SRC ${SRC} backend_performance.cpp codegen.cpp)
    set(BACKEND_NAMES ${BACKEND_NAMES} "CPU")
endif()
...
@@ -22,6 +22,7 @@
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/log.hpp"
+#include "ngraph/ops/concatenate.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"

@@ -33,6 +34,13 @@
using namespace std;
using namespace ngraph;
+template <typename T>
+static void copy_data(shared_ptr<runtime::TensorView> tv, const vector<T>& data)
+{
+    size_t data_size = data.size() * sizeof(T);
+    tv->write(data.data(), 0, data_size);
+}
// Starting point CPU: 1.2ms/iteration
shared_ptr<runtime::TensorView> make_tensor(runtime::Backend& backend, const ValueType& value)

@@ -124,3 +132,115 @@ TEST(benchmark, mxnet_10_bucket_lstm)
        NGRAPH_INFO << p.name() << ", " << p.total_microseconds();
    }
}
+//
+// Benchmarks a graph that concatenates six 32x1x200 arrays along the middle axis.
+//
+TEST(benchmark, concat_32x1x200_axis1_6)
+{
+    const size_t n_arrays = 6;
+    Shape shape_of_each_array = Shape{32, 1, 200};
+    size_t concatenation_axis = 1;
+    Shape result_shape;
+    result_shape = shape_of_each_array;
+    result_shape[concatenation_axis] *= n_arrays;
+    size_t elements_per_array = 1;
+    for (size_t d : shape_of_each_array)
+    {
+        elements_per_array *= d;
+    }
+    vector<vector<float>> data_arrays(n_arrays);
+    for (size_t i = 0; i < n_arrays; i++)
+    {
+        data_arrays[i] = vector<float>(elements_per_array);
+        for (size_t j = 0; j < elements_per_array; j++)
+        {
+            data_arrays[i][j] = float(j + 1);
+        }
+    }
+    bool using_ref_kernels = (std::getenv("NGRAPH_CPU_USE_REF_KERNELS") != nullptr);
+    vector<std::string> backend_names{"INTERPRETER", "NGVM", "CPU"};
+    vector<int> n_runs{200, 200, using_ref_kernels ? 200 : 200000}; // one for each backend
+    vector<std::function<void()>> test_callbacks;                   // one for each backend
+    vector<std::shared_ptr<runtime::TensorView>> result_tvs;        // one for each backend
+    for (std::string backend_name : backend_names)
+    {
+        vector<std::shared_ptr<op::Parameter>> params(n_arrays);
+        vector<std::shared_ptr<Node>> params_as_nodes(n_arrays);
+        for (size_t i = 0; i < n_arrays; i++)
+        {
+            auto param = make_shared<op::Parameter>(make_shared<TensorViewType>(
+                element::Float32::element_type(), shape_of_each_array));
+            params[i] = param;
+            params_as_nodes[i] = param;
+        }
+        auto concat = make_shared<op::Concat>(params_as_nodes, concatenation_axis);
+        auto f = make_shared<Function>(concat, params);
+        auto manager = runtime::Manager::get(backend_name);
+        auto external = manager->compile(f);
+        auto backend = manager->allocate_backend();
+        auto cf = backend->make_call_frame(external);
+        vector<shared_ptr<runtime::Value>> input_vals;
+        for (size_t i = 0; i < n_arrays; i++)
+        {
+            auto tv = backend->make_primary_tensor_view(element::Float32::element_type(),
+                                                        shape_of_each_array);
+            copy_data(tv, data_arrays[i]);
+            input_vals.push_back(tv);
+        }
+        auto result_tv =
+            backend->make_primary_tensor_view(element::Float32::element_type(), result_shape);
+        result_tvs.push_back(result_tv);
+        std::function<void()> cb = [input_vals, result_tv, cf]() {
+            cf->call(input_vals, {result_tv});
+        };
+        test_callbacks.push_back(cb);
+    }
+    for (size_t i = 0; i < backend_names.size(); i++)
+    {
+        std::cout << backend_names[i] << ": " << n_runs[i] << " tests in " << std::flush;
+        stopwatch sw;
+        std::function<void()> cb = test_callbacks[i];
+        sw.start();
+        for (int j = 0; j < n_runs[i]; j++)
+        {
+            cb();
+        }
+        sw.stop();
+        std::cout << sw.get_milliseconds() << "ms (" << (sw.get_microseconds() / n_runs[i])
+                  << " us/test)" << std::endl;
+    }
+    for (size_t i = 1; i < backend_names.size(); i++)
+    {
+        std::cout << "Verifying " << backend_names[i] << " result against " << backend_names[0]
+                  << "..." << std::flush;
+        if (result_tvs[i]->get_vector<float>() == result_tvs[0]->get_vector<float>())
+        {
+            std::cout << " OK" << std::endl;
+        }
+        else
+        {
+            std::cout << " FAILED" << std::endl;
+            ADD_FAILURE();
+        }
+    }
+}