ngraph / Commits / fdab16db

Unverified commit fdab16db, authored Dec 13, 2017 by Adam Procter; committed by GitHub, Dec 13, 2017.

Codegen for >2D concat following ref kernel pattern (#296)

Parent: c89b1a84

Showing 15 changed files with 718 additions and 38 deletions (+718 / -38).
Files changed:

    src/ngraph/CMakeLists.txt                          +2    -0
    src/ngraph/codegen/code_writer.cpp                 +11   -0
    src/ngraph/codegen/code_writer.hpp                 +3    -0
    src/ngraph/coordinate_transform.cpp                +25   -25
    src/ngraph/coordinate_transform.hpp                +7    -2
    src/ngraph/runtime/cpu/cpu_emitter.cpp             +44   -0
    src/ngraph/runtime/cpu/cpu_emitter.hpp             +2    -0
    src/ngraph/runtime/cpu/cpu_external_function.cpp   +1    -0
    src/ngraph/runtime/cpu/cpu_kernel_emitters.cpp     +51   -0
    src/ngraph/runtime/cpu/cpu_kernel_emitters.hpp     +38   -0
    src/ngraph/runtime/cpu/cpu_kernel_utils.cpp        +227  -0
    src/ngraph/runtime/cpu/cpu_kernel_utils.hpp        +48   -0
    test/CMakeLists.txt                                +1    -2
    test/backend_performance.cpp                       +120  -0
    test/backend_test.in.cpp                           +138  -9
src/ngraph/CMakeLists.txt

@@ -151,6 +151,8 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
     runtime/cpu/cpu_backend.cpp
     runtime/cpu/cpu_manager.cpp
     runtime/cpu/cpu_kernels.cpp
+    runtime/cpu/cpu_kernel_emitters.cpp
+    runtime/cpu/cpu_kernel_utils.cpp
     runtime/cpu/cpu_emitter.cpp
     runtime/cpu/cpu_external_function.cpp
     runtime/cpu/cpu_tensor_view.cpp
src/ngraph/codegen/code_writer.cpp

@@ -20,6 +20,7 @@ using namespace ngraph;

 codegen::CodeWriter::CodeWriter()
     : indent(0)
     , m_pending_indent(true)
+    , m_temporary_name_count(0)
 {
 }

@@ -32,3 +33,13 @@ void codegen::CodeWriter::operator+=(const std::string& s)
 {
     *this << s;
 }
+
+std::string codegen::CodeWriter::generate_temporary_name(std::string prefix)
+{
+    std::stringstream ss;
+
+    ss << prefix << "__" << m_temporary_name_count;
+    m_temporary_name_count++;
+
+    return ss.str();
+}
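A quick usage sketch (variable names illustrative): the counter lives on the writer and is shared across prefixes, so successive calls never collide, whatever prefix is passed.

    codegen::CodeWriter writer;

    // The suffix comes from a single per-writer counter, not a per-prefix one.
    std::string a = writer.generate_temporary_name();    // "tempvar__0"
    std::string b = writer.generate_temporary_name("i"); // "i__1"
    std::string c = writer.generate_temporary_name("i"); // "i__2"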
src/ngraph/codegen/code_writer.hpp

@@ -66,7 +66,10 @@ public:
         return out;
     }

+    std::string generate_temporary_name(std::string prefix = "tempvar");
+
 private:
     std::stringstream m_ss;
     bool m_pending_indent;
+    size_t m_temporary_name_count;
 };
src/ngraph/coordinate_transform.cpp

@@ -30,18 +30,18 @@ inline T ceil_div(T x, T y)
     return (x == 0 ? 0 : (1 + (x - 1) / y));
 }

-CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
+CoordinateTransform::CoordinateTransform(const Shape& source_shape,
                                          const Coordinate& source_start_corner,
                                          const Coordinate& source_end_corner,
                                          const Strides& source_strides,
                                          const AxisVector& source_axis_order)
-    : m_source_space_shape(source_space_shape)
+    : m_source_shape(source_shape)
     , m_source_start_corner(source_start_corner)
     , m_source_end_corner(source_end_corner)
     , m_source_strides(source_strides)
     , m_source_axis_order(source_axis_order)
 {
-    m_n_axes = source_space_shape.size();
+    m_n_axes = source_shape.size();

     if (m_n_axes != source_start_corner.size())
     {

@@ -80,8 +80,8 @@ CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
     for (size_t i = 0; i < m_n_axes; i++)
     {
-        if (source_start_corner[i] >= source_space_shape[i] &&
-            !(source_start_corner[i] == 0 && source_space_shape[i] == 0))
+        if (source_start_corner[i] >= source_shape[i] &&
+            !(source_start_corner[i] == 0 && source_shape[i] == 0))
         {
             std::stringstream ss;

@@ -92,7 +92,7 @@ CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
     for (size_t i = 0; i < m_n_axes; i++)
     {
-        if (source_end_corner[i] > source_space_shape[i])
+        if (source_end_corner[i] > source_shape[i])
         {
             std::stringstream ss;

@@ -120,7 +120,7 @@ CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
     }
 }

-AxisVector default_axis_order(size_t n_axes)
+static AxisVector default_axis_order(size_t n_axes)
 {
     AxisVector result(n_axes);
     size_t n = 0;

@@ -129,50 +129,50 @@ AxisVector default_axis_order(size_t n_axes)
     return result;
 }

-CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
+CoordinateTransform::CoordinateTransform(const Shape& source_shape,
                                          const Coordinate& source_start_corner,
                                          const Coordinate& source_end_corner,
                                          const Strides& source_strides)
-    : CoordinateTransform(source_space_shape,
+    : CoordinateTransform(source_shape,
                           source_start_corner,
                           source_end_corner,
                           source_strides,
-                          default_axis_order(source_space_shape.size()))
+                          default_axis_order(source_shape.size()))
 {
 }

-Strides default_source_strides(size_t n_axes)
+static Strides default_source_strides(size_t n_axes)
 {
     return AxisVector(n_axes, 1);
 }

-CoordinateTransform::CoordinateTransform(const Shape& source_space_shape,
+CoordinateTransform::CoordinateTransform(const Shape& source_shape,
                                          const Coordinate& source_start_corner,
                                          const Coordinate& source_end_corner)
-    : CoordinateTransform(source_space_shape,
+    : CoordinateTransform(source_shape,
                           source_start_corner,
                           source_end_corner,
-                          default_source_strides(source_space_shape.size()),
-                          default_axis_order(source_space_shape.size()))
+                          default_source_strides(source_shape.size()),
+                          default_axis_order(source_shape.size()))
 {
 }

-Coordinate default_source_start_corner(size_t n_axes)
+static Coordinate default_source_start_corner(size_t n_axes)
 {
     return Coordinate(n_axes, 0);
 }

-Coordinate default_source_end_corner(const Shape& source_space_shape)
+static Coordinate default_source_end_corner(const Shape& source_shape)
 {
-    return source_space_shape;
+    return source_shape;
 }

-CoordinateTransform::CoordinateTransform(const Shape& source_space_shape)
-    : CoordinateTransform(source_space_shape,
-                          default_source_start_corner(source_space_shape.size()),
-                          default_source_end_corner(source_space_shape),
-                          default_source_strides(source_space_shape.size()),
-                          default_axis_order(source_space_shape.size()))
+CoordinateTransform::CoordinateTransform(const Shape& source_shape)
+    : CoordinateTransform(source_shape,
+                          default_source_start_corner(source_shape.size()),
+                          default_source_end_corner(source_shape),
+                          default_source_strides(source_shape.size()),
+                          default_axis_order(source_shape.size()))
 {
 }

@@ -185,7 +185,7 @@ size_t CoordinateTransform::index_source(const Coordinate& c) const
     for (size_t axis = m_n_axes; axis-- > 0;)
     {
         index += c[axis] * stride;
-        stride *= m_source_space_shape[axis];
+        stride *= m_source_shape[axis];
     }

     return index;
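The loop in index_source (unchanged here apart from the rename) is standard row-major flattening. A standalone restatement with a worked example, using values chosen purely for illustration:

    #include <cstddef>
    #include <vector>

    // Row-major flattening, as index_source does: walk axes from last to
    // first, accumulating the running stride.
    std::size_t row_major_index(const std::vector<std::size_t>& shape,
                                const std::vector<std::size_t>& c)
    {
        std::size_t index = 0;
        std::size_t stride = 1;
        for (std::size_t axis = shape.size(); axis-- > 0;)
        {
            index += c[axis] * stride;
            stride *= shape[axis];
        }
        return index;
    }

    // For shape (2, 3, 4) and coordinate (1, 2, 3):
    //   axis 2: index += 3 * 1  -> 3   (stride becomes 4)
    //   axis 1: index += 2 * 4  -> 11  (stride becomes 12)
    //   axis 0: index += 1 * 12 -> 23  (stride becomes 24)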
src/ngraph/coordinate_transform.hpp

@@ -41,12 +41,17 @@ namespace ngraph
                             const Coordinate& source_start_corner,
                             const Coordinate& source_end_corner);

-        CoordinateTransform(const Shape& source_space_shape);
+        CoordinateTransform(const Shape& source_shape);

         size_t index(const Coordinate& c) const;
         bool in_bounds(const Coordinate& c) const;
         Coordinate get_target_shape() const;
+        Shape get_source_shape() { return m_source_shape; }
+        Coordinate get_source_start_corner() { return m_source_start_corner; }
+        Coordinate get_source_end_corner() { return m_source_end_corner; }
+        Strides get_source_strides() { return m_source_strides; }
+        AxisVector get_source_axis_order() { return m_source_axis_order; }

         class Iterator
         {
         public:

@@ -73,7 +78,7 @@ namespace ngraph
         Coordinate to_source_coordinate(const Coordinate& c) const;
         size_t index_source(const Coordinate& c) const;

-        Shape m_source_space_shape;
+        Shape m_source_shape;
         Shape m_source_start_corner;
         Shape m_source_end_corner;
         Strides m_source_strides;
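The new getters make the defaulted fields observable. For a transform built from only a shape, their values are fully determined by the static default_* helpers in the .cpp above; expected values for a small illustrative case:

    CoordinateTransform t(Shape{2, 3});

    // t.get_source_shape()        == Shape{2, 3}
    // t.get_source_start_corner() == Coordinate{0, 0}  (all zeros)
    // t.get_source_end_corner()   == Coordinate{2, 3}  (the shape itself)
    // t.get_source_strides()      == Strides{1, 1}     (unit strides)
    // t.get_source_axis_order()   == AxisVector{0, 1}  (identity order)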
src/ngraph/runtime/cpu/cpu_emitter.cpp

@@ -34,6 +34,7 @@
 #include "ngraph/ops/slice.hpp"
 #include "ngraph/ops/sum.hpp"
 #include "ngraph/runtime/cpu/cpu_emitter.hpp"
+#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
 #include "ngraph/util.hpp"

 using namespace std;

@@ -250,6 +251,49 @@ void runtime::cpu::CPU_Emitter::EmitConcat(const ngraph::Node* n,
         m_out.indent--;
         m_out << "}\n";
     }
+    else
+    {
+        if (m_use_ref_kernels)
+        {
+            auto axis = (dynamic_cast<const op::Concat*>(n))->get_concatenation_axis();
+
+            std::vector<std::string> arg_names;
+            std::vector<std::string> arg_shape_strings;
+
+            for (auto arg : args)
+            {
+                arg_names.push_back(arg.get_name());
+                arg_shape_strings.push_back("{" + join(arg.get_shape()) + "}");
+            }
+
+            m_out << "kernel::concat<" << out[0].get_type() << ">({" << join(arg_names) << "},\n";
+            m_out << "               " << out[0].get_name() << ",\n";
+            m_out << "               {" << join(arg_shape_strings) << "},\n";
+            m_out << "               {" << join(result_shape) << "},\n";
+            m_out << "               " << axis << ");\n";
+        }
+        else
+        {
+            auto axis = (dynamic_cast<const op::Concat*>(n))->get_concatenation_axis();
+
+            std::vector<std::string> arg_names;
+            std::vector<Shape> arg_shapes;
+
+            for (auto arg : args)
+            {
+                arg_names.push_back(arg.get_name());
+                arg_shapes.push_back(arg.get_shape());
+            }
+
+            kernels::emit_concat(m_out,
+                                 args[0].get_element_type().c_type_string(),
+                                 arg_names,
+                                 out[0].get_name(),
+                                 arg_shapes,
+                                 result_shape,
+                                 axis);
+        }
+    }
 }

 void runtime::cpu::CPU_Emitter::EmitDivide(const ngraph::Node* n,
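To make the dispatch concrete: for a hypothetical two-input float concat (buffer names arg0, arg1, out0 are illustrative, not taken from real emitter output), the reference-kernel branch writes a call along these lines into the generated source, while the non-ref branch expands into explicit loops via kernels::emit_concat (reconstructed in cpu_kernel_utils.cpp below):

    // Approximate text produced by the m_use_ref_kernels branch:
    kernel::concat<float>({arg0, arg1},
                   out0,
                   {{2, 2}, {3, 2}},
                   {5, 2},
                   0);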
src/ngraph/runtime/cpu/cpu_emitter.hpp

@@ -37,10 +37,12 @@ namespace ngraph
             {
             protected:
                 codegen::CodeWriter m_out;
+                bool m_use_ref_kernels;

             public:
                 CPU_Emitter()
                     : m_out()
+                    , m_use_ref_kernels(std::getenv("NGRAPH_CPU_USE_REF_KERNELS") != nullptr)
                 {
                 }
                 std::string get_code() { return m_out.get_code(); }
src/ngraph/runtime/cpu/cpu_external_function.cpp

@@ -207,6 +207,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
 #include "ngraph/runtime/cpu/cpu_eigen_utils.hpp"
 #include "ngraph/runtime/cpu/cpu_kernels.hpp"
 #include "ngraph/runtime/kernel/broadcast.hpp"
+#include "ngraph/runtime/kernel/concat.hpp"
 #include "ngraph/runtime/kernel/dot.hpp"
 #include "ngraph/runtime/kernel/one_hot.hpp"
 #include "ngraph/runtime/kernel/reduce.hpp"
src/ngraph/runtime/cpu/cpu_kernel_emitters.cpp (new file, mode 100644)

// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------

#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/cpu/cpu_kernel_utils.hpp"

using namespace ngraph;
using namespace ngraph::runtime::cpu::kernels;

//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void ngraph::runtime::cpu::kernels::emit_concat(codegen::CodeWriter& writer,
                                                std::string element_type,
                                                const std::vector<std::string> args,
                                                std::string out,
                                                const std::vector<Shape>& in_shapes,
                                                const Shape& out_shape,
                                                size_t concatenation_axis)
{
    size_t concatenation_pos = 0;

    for (size_t i = 0; i < args.size(); i++)
    {
        Coordinate out_start_coord = Coordinate(out_shape.size(), 0);
        out_start_coord[concatenation_axis] = concatenation_pos;

        Coordinate out_end_coord = out_shape;
        out_end_coord[concatenation_axis] = concatenation_pos + in_shapes[i][concatenation_axis];

        CoordinateTransform input_transform(in_shapes[i]);
        CoordinateTransform output_chunk_transform(out_shape, out_start_coord, out_end_coord);

        emit_pointwise_copy(
            writer, element_type, args[i], out, input_transform, output_chunk_transform);

        concatenation_pos += in_shapes[i][concatenation_axis];
    }
}
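A hedged trace of the loop above, with shapes chosen purely for illustration: concatenating inputs of shape {2, 2} and {3, 2} along axis 0 into a {5, 2} output.

    codegen::CodeWriter writer;

    // Hypothetical buffer names; the emitter normally passes tensor names here.
    kernels::emit_concat(writer,
                         "float",
                         {"arg0", "arg1"},
                         "out0",
                         {Shape{2, 2}, Shape{3, 2}},
                         Shape{5, 2},
                         0);

    // Iteration 0 emits a copy of arg0 into rows [0, 2) of out0
    // (out_start_coord = {0, 0}, out_end_coord = {2, 2}); iteration 1
    // copies arg1 into rows [2, 5), since concatenation_pos advances by
    // each input's extent along the axis (2, then 3).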
src/ngraph/runtime/cpu/cpu_kernel_emitters.hpp (new file, mode 100644)

// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------

#pragma once

#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernels
            {
                void emit_concat(codegen::CodeWriter& writer,
                                 std::string element_type,
                                 const std::vector<std::string> args,
                                 std::string out,
                                 const std::vector<Shape>& in_shapes,
                                 const Shape& out_shape,
                                 size_t concatenation_axis);
            }
        }
    }
}
src/ngraph/runtime/cpu/cpu_kernel_utils.cpp (new file, mode 100644)

// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------

#include "ngraph/runtime/cpu/cpu_kernel_utils.hpp"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
#include "ngraph/coordinate_transform.hpp"
#include "ngraph/util.hpp"

using namespace ngraph;
using namespace ngraph::runtime::cpu::kernels;

//
// Given a coordinate transform and a vector of index expressions relative to
// the target coordinate space, produces the strings needed to index into the
// source coordinate space if it is represented as a multidimensional array.
//
// For example,
//
//    trans has stride (2,2,2), axis order (2,0,1), and start offsets (3,4,5)
//
//    index_vars are "i", "j", "k"
//
// this will produce:
//
//    {"((k) * 2 + 5)", "((i) * 2 + 3)", "((j) * 2 + 4)"}
//
std::vector<std::string>
    ngraph::runtime::cpu::kernels::emit_multi_indices(CoordinateTransform trans,
                                                      std::vector<std::string> index_vars)
{
    std::vector<std::string> result;

    for (size_t i = 0; i < index_vars.size(); i++)
    {
        std::string index_var = index_vars[trans.get_source_axis_order()[i]];
        size_t source_stride = trans.get_source_strides()[i];
        size_t source_start = trans.get_source_start_corner()[i];

        std::stringstream ss;

        if (source_stride == 1 && source_start == 0)
        {
            ss << index_var;
        }
        else if (source_stride == 1)
        {
            ss << "((" << index_var << ") + " << source_start << ")";
        }
        else if (source_start == 0)
        {
            ss << "(" << source_stride << " * (" << index_var << "))";
        }
        else
        {
            ss << "(" << source_stride << " * (" << index_var << ") + " << source_start << ")";
        }

        result.push_back(ss.str());
    }

    return result;
}

//
// Given a coordinate transform and a vector of index expressions relative to
// the target coordinate space, produces the string needed to index into the
// source coordinate space if it is represented as a flat (linearized) array.
//
// For example,
//
//    trans has source shape (2,2,2), stride (2,2,2), axis order (2,0,1),
//    and start offsets (3,4,5)
//
//    index_vars are "i", "j", "k"
//
// this will produce:
//
//    "((4 * ((k) * 2 + 5)) + (2 * ((i) * 2 + 3)) + ((j) * 2 + 4))"
//
std::string ngraph::runtime::cpu::kernels::emit_linear_index(CoordinateTransform trans,
                                                             std::vector<std::string> index_vars)
{
    std::vector<std::string> multi_indices = emit_multi_indices(trans, index_vars);

    size_t stride = 1;

    for (size_t i = index_vars.size(); i-- > 0;)
    {
        // No need to do this (multiply by stride) if it's 1, though it wouldn't hurt anything.
        if (stride != 1)
        {
            std::stringstream ss;
            ss << "(" << stride << " * " << multi_indices[i] << ")";
            multi_indices[i] = ss.str();
        }

        stride *= trans.get_source_shape()[i];
    }

    std::stringstream ss;
    ss << "(" << join(multi_indices, " + ") << ")";

    return ss.str();
}

//
// Begins an indexing loop (just a for-loop) with index_var as the index
// variable, starting at start, continuing while [index_var] < [end].
//
// Optionally emits an OpenMP parallel pragma, if "omp" is true.
//
std::string ngraph::runtime::cpu::kernels::start_index_loop(std::string index_var,
                                                            size_t start,
                                                            size_t end,
                                                            bool omp)
{
    std::stringstream ss;

    if (omp)
    {
        ss << "#pragma omp parallel for\n";
    }

    ss << "for(size_t " << index_var << " = " << start << "; " << index_var << " < " << end
       << "; " << index_var << "++)\n"
       << "{\n";

    return ss.str();
}

//
// Ends an indexing loop on the index variable [index_var].
//
std::string ngraph::runtime::cpu::kernels::end_index_loop(std::string index_var)
{
    std::stringstream ss;

    ss << "} // end for(" << index_var << ")\n";

    return ss.str();
}

std::string ngraph::runtime::cpu::kernels::emit_nd_sizes(CoordinateTransform trans)
{
    std::stringstream ss;

    for (size_t s : trans.get_source_shape())
    {
        ss << "[" << s << "]";
    }

    return ss.str();
}

std::string ngraph::runtime::cpu::kernels::emit_nd_index(CoordinateTransform trans,
                                                         std::vector<std::string> index_vars)
{
    std::stringstream ss;

    for (std::string index : emit_multi_indices(trans, index_vars))
    {
        ss << "[" << index << "]";
    }

    return ss.str();
}

//
// Emits a pointwise copy from source_buffer mediated by source_trans, to
// dest_buffer mediated by dest_trans.
//
void ngraph::runtime::cpu::kernels::emit_pointwise_copy(codegen::CodeWriter& writer,
                                                        std::string element_type,
                                                        std::string source_buffer,
                                                        std::string dest_buffer,
                                                        CoordinateTransform source_trans,
                                                        CoordinateTransform dest_trans)
{
    std::vector<std::string> index_vars;

    Shape source_start_corner = source_trans.get_source_start_corner();
    Shape source_end_corner = source_trans.get_source_end_corner();

    size_t n_axes = source_start_corner.size();

    std::string source_nd_name = writer.generate_temporary_name("source_nd");
    std::string dest_nd_name = writer.generate_temporary_name("dest_nd");

    writer << element_type << "(&" << source_nd_name << ")" << emit_nd_sizes(source_trans)
           << " = *reinterpret_cast<" << element_type << "(*)" << emit_nd_sizes(source_trans)
           << ">(" << source_buffer << ");\n";
    writer << element_type << "(&" << dest_nd_name << ")" << emit_nd_sizes(dest_trans)
           << " = *reinterpret_cast<" << element_type << "(*)" << emit_nd_sizes(dest_trans)
           << ">(" << dest_buffer << ");\n";

    for (size_t i = 0; i < n_axes; i++)
    {
        std::string index_var = writer.generate_temporary_name("i");

        writer << start_index_loop(index_var, source_start_corner[i], source_end_corner[i], i == 0);
        writer.indent++;

        index_vars.push_back(index_var);
    }

    writer << dest_nd_name << emit_nd_index(dest_trans, index_vars) << " = " << source_nd_name
           << emit_nd_index(source_trans, index_vars) << ";\n";

    for (size_t i = n_axes; i-- > 0;)
    {
        writer.indent--;
        writer << end_index_loop(index_vars[i]);
    }
}
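Putting the helpers together: for the second chunk of the {2, 2} + {3, 2} -> {5, 2} example above, the text emit_pointwise_copy writes would look roughly like the following (temporary names shown as for a fresh writer; only the outermost loop gets the OpenMP pragma, since omp is passed as i == 0; the annotation comment is mine, not emitted):

    float(&source_nd__0)[3][2] = *reinterpret_cast<float(*)[3][2]>(arg1);
    float(&dest_nd__1)[5][2] = *reinterpret_cast<float(*)[5][2]>(out0);
    #pragma omp parallel for
    for(size_t i__2 = 0; i__2 < 3; i__2++)
    {
        for(size_t i__3 = 0; i__3 < 2; i__3++)
        {
            // Loop bounds come from the source transform; the "+ 2" offset
            // comes from the destination chunk's start corner.
            dest_nd__1[((i__2) + 2)][i__3] = source_nd__0[i__2][i__3];
        } // end for(i__3)
    } // end for(i__2)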
src/ngraph/runtime/cpu/cpu_kernel_utils.hpp (new file, mode 100644)

// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------

#pragma once

#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"
#include "ngraph/coordinate_transform.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernels
            {
                std::vector<std::string> emit_multi_indices(CoordinateTransform trans,
                                                            std::vector<std::string> index_vars);
                std::string emit_linear_index(CoordinateTransform trans,
                                              std::vector<std::string> index_vars);
                std::string start_index_loop(std::string index_var,
                                             size_t start,
                                             size_t end,
                                             bool omp);
                std::string end_index_loop(std::string index_var);
                std::string emit_nd_sizes(CoordinateTransform trans);
                std::string emit_nd_index(CoordinateTransform trans,
                                          std::vector<std::string> index_vars);
                void emit_pointwise_copy(codegen::CodeWriter& writer,
                                         std::string element_type,
                                         std::string source_buffer,
                                         std::string dest_buffer,
                                         CoordinateTransform source_trans,
                                         CoordinateTransform dest_trans);
            }
        }
    }
}
test/CMakeLists.txt

@@ -22,7 +22,6 @@ include_directories(
 )

 set (SRC
-    backend_performance.cpp
     builder.cpp
     builder_autobroadcast.cpp
     build_graph.cpp

@@ -69,7 +68,7 @@ endif()

 if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR)
     include_directories(SYSTEM ${LLVM_INCLUDE_DIR})
     link_directories(${LLVM_LIB_DIR})
-    set(SRC ${SRC} codegen.cpp)
+    set(SRC ${SRC} backend_performance.cpp codegen.cpp)
     set(BACKEND_NAMES ${BACKEND_NAMES} "CPU")
 endif()
test/backend_performance.cpp

@@ -22,6 +22,7 @@
 #include "ngraph/codegen/execution_engine.hpp"
 #include "ngraph/file_util.hpp"
 #include "ngraph/log.hpp"
+#include "ngraph/ops/concatenate.hpp"
 #include "ngraph/runtime/backend.hpp"
 #include "ngraph/runtime/call_frame.hpp"
 #include "ngraph/runtime/cpu/cpu_call_frame.hpp"

@@ -33,6 +34,13 @@
 using namespace std;
 using namespace ngraph;

+template <typename T>
+static void copy_data(shared_ptr<runtime::TensorView> tv, const vector<T>& data)
+{
+    size_t data_size = data.size() * sizeof(T);
+    tv->write(data.data(), 0, data_size);
+}
+
 // Starting point CPU: 1.2ms/iteration

 shared_ptr<runtime::TensorView> make_tensor(runtime::Backend& backend, const ValueType& value)

@@ -124,3 +132,115 @@ TEST(benchmark, mxnet_10_bucket_lstm)
         NGRAPH_INFO << p.name() << ", " << p.total_microseconds();
     }
 }
+
+//
+// Benchmarks a graph that concatenates six 32x1x200 arrays along the middle axis.
+//
+TEST(benchmark, concat_32x1x200_axis1_6)
+{
+    const size_t n_arrays = 6;
+    Shape shape_of_each_array = Shape{32, 1, 200};
+    size_t concatenation_axis = 1;
+
+    Shape result_shape;
+    result_shape = shape_of_each_array;
+    result_shape[concatenation_axis] *= n_arrays;
+
+    size_t elements_per_array = 1;
+    for (size_t d : shape_of_each_array)
+    {
+        elements_per_array *= d;
+    }
+
+    vector<vector<float>> data_arrays(n_arrays);
+    for (size_t i = 0; i < n_arrays; i++)
+    {
+        data_arrays[i] = vector<float>(elements_per_array);
+        for (size_t j = 0; j < elements_per_array; j++)
+        {
+            data_arrays[i][j] = float(j + 1);
+        }
+    }
+
+    bool using_ref_kernels = (std::getenv("NGRAPH_CPU_USE_REF_KERNELS") != nullptr);
+
+    vector<std::string> backend_names{"INTERPRETER", "NGVM", "CPU"};
+    vector<int> n_runs{200, 200, using_ref_kernels ? 200 : 200000}; // one for each backend
+    vector<std::function<void()>> test_callbacks;                   // one for each backend
+    vector<std::shared_ptr<runtime::TensorView>> result_tvs;        // one for each backend
+
+    for (std::string backend_name : backend_names)
+    {
+        vector<std::shared_ptr<op::Parameter>> params(n_arrays);
+        vector<std::shared_ptr<Node>> params_as_nodes(n_arrays);
+        for (size_t i = 0; i < n_arrays; i++)
+        {
+            auto param = make_shared<op::Parameter>(
+                make_shared<TensorViewType>(element::Float32::element_type(), shape_of_each_array));
+            params[i] = param;
+            params_as_nodes[i] = param;
+        }
+
+        auto concat = make_shared<op::Concat>(params_as_nodes, concatenation_axis);
+        auto f = make_shared<Function>(concat, params);
+
+        auto manager = runtime::Manager::get(backend_name);
+        auto external = manager->compile(f);
+        auto backend = manager->allocate_backend();
+        auto cf = backend->make_call_frame(external);
+
+        vector<shared_ptr<runtime::Value>> input_vals;
+
+        for (size_t i = 0; i < n_arrays; i++)
+        {
+            auto tv = backend->make_primary_tensor_view(element::Float32::element_type(),
+                                                        shape_of_each_array);
+            copy_data(tv, data_arrays[i]);
+            input_vals.push_back(tv);
+        }
+
+        auto result_tv =
+            backend->make_primary_tensor_view(element::Float32::element_type(), result_shape);
+        result_tvs.push_back(result_tv);
+
+        std::function<void()> cb = [input_vals, result_tv, cf]() {
+            cf->call(input_vals, {result_tv});
+        };
+
+        test_callbacks.push_back(cb);
+    }
+
+    for (size_t i = 0; i < backend_names.size(); i++)
+    {
+        std::cout << backend_names[i] << ": " << n_runs[i] << " tests in " << std::flush;
+
+        stopwatch sw;
+        std::function<void()> cb = test_callbacks[i];
+
+        sw.start();
+        for (int j = 0; j < n_runs[i]; j++)
+        {
+            cb();
+        }
+        sw.stop();
+
+        std::cout << sw.get_milliseconds() << "ms (" << (sw.get_microseconds() / n_runs[i])
+                  << " us/test)" << std::endl;
+    }
+
+    for (size_t i = 1; i < backend_names.size(); i++)
+    {
+        std::cout << "Verifying " << backend_names[i] << " result against " << backend_names[0]
+                  << "..." << std::flush;
+
+        if (result_tvs[i]->get_vector<float>() == result_tvs[0]->get_vector<float>())
+        {
+            std::cout << " OK" << std::endl;
+        }
+        else
+        {
+            std::cout << " FAILED" << std::endl;
+            ADD_FAILURE();
+        }
+    }
+}
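As a sanity check on the benchmark's shapes (pure arithmetic, restated from the test): concatenating six 32x1x200 arrays along axis 1 yields a 32x6x200 result, and the element counts line up.

    constexpr size_t n_arrays = 6;
    constexpr size_t elements_per_array = 32 * 1 * 200; // 6400 floats per input
    static_assert(elements_per_array * n_arrays == 32 * 6 * 200,
                  "concat along axis 1 preserves the total element count");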
test/backend_test.in.cpp

@@ -21,6 +21,7 @@
 #include "ngraph/log.hpp"
 #include "ngraph/ngraph.hpp"
 #include "ngraph/serializer.hpp"
+#include "util/all_close.hpp"
 #include "util/ndarray.hpp"

 using namespace std;

@@ -441,6 +442,134 @@ TEST(${BACKEND_NAME}, concat_vector)
               result->get_vector<float>());
 }

+// from numpy import *
+// a=linspace(1,2*3*4*3*2,2*3*4*3*2)
+// b=linspace(1000+1,1000+2*3*3*3*2,2*3*3*3*2)
+// c=linspace(2000+1,2000+2*3*2*3*2,2*3*2*3*2)
+// a.shape=(2,3,4,3,2)
+// b.shape=(2,3,3,3,2)
+// c.shape=(2,3,2,3,2)
+// z=concatenate((a,b,c),axis=2)
+// z.shape=(2*3*(4+3+2)*3*2)
+// set_printoptions(suppress=True)
+// print(z)
+//
+// [    1.     2.     3.     4.     5.     6.     7.     8.     9.    10.
+//     11.    12.    13.    14.    15.    16.    17.    18.    19.    20.
+//     21.    22.    23.    24.  1001.  1002.  1003.  1004.  1005.  1006.
+//   1007.  1008.  1009.  1010.  1011.  1012.  1013.  1014.  1015.  1016.
+//   1017.  1018.  2001.  2002.  2003.  2004.  2005.  2006.  2007.  2008.
+//   2009.  2010.  2011.  2012.    25.    26.    27.    28.    29.    30.
+//     31.    32.    33.    34.    35.    36.    37.    38.    39.    40.
+//     41.    42.    43.    44.    45.    46.    47.    48.  1019.  1020.
+//   1021.  1022.  1023.  1024.  1025.  1026.  1027.  1028.  1029.  1030.
+//   1031.  1032.  1033.  1034.  1035.  1036.  2013.  2014.  2015.  2016.
+//   2017.  2018.  2019.  2020.  2021.  2022.  2023.  2024.    49.    50.
+//     51.    52.    53.    54.    55.    56.    57.    58.    59.    60.
+//     61.    62.    63.    64.    65.    66.    67.    68.    69.    70.
+//     71.    72.  1037.  1038.  1039.  1040.  1041.  1042.  1043.  1044.
+//   1045.  1046.  1047.  1048.  1049.  1050.  1051.  1052.  1053.  1054.
+//   2025.  2026.  2027.  2028.  2029.  2030.  2031.  2032.  2033.  2034.
+//   2035.  2036.    73.    74.    75.    76.    77.    78.    79.    80.
+//     81.    82.    83.    84.    85.    86.    87.    88.    89.    90.
+//     91.    92.    93.    94.    95.    96.  1055.  1056.  1057.  1058.
+//   1059.  1060.  1061.  1062.  1063.  1064.  1065.  1066.  1067.  1068.
+//   1069.  1070.  1071.  1072.  2037.  2038.  2039.  2040.  2041.  2042.
+//   2043.  2044.  2045.  2046.  2047.  2048.    97.    98.    99.   100.
+//    101.   102.   103.   104.   105.   106.   107.   108.   109.   110.
+//    111.   112.   113.   114.   115.   116.   117.   118.   119.   120.
+//   1073.  1074.  1075.  1076.  1077.  1078.  1079.  1080.  1081.  1082.
+//   1083.  1084.  1085.  1086.  1087.  1088.  1089.  1090.  2049.  2050.
+//   2051.  2052.  2053.  2054.  2055.  2056.  2057.  2058.  2059.  2060.
+//    121.   122.   123.   124.   125.   126.   127.   128.   129.   130.
+//    131.   132.   133.   134.   135.   136.   137.   138.   139.   140.
+//    141.   142.   143.   144.  1091.  1092.  1093.  1094.  1095.  1096.
+//   1097.  1098.  1099.  1100.  1101.  1102.  1103.  1104.  1105.  1106.
+//   1107.  1108.  2061.  2062.  2063.  2064.  2065.  2066.  2067.  2068.
+//   2069.  2070.  2071.  2072.]
+TEST(${BACKEND_NAME}, concat_5d)
+{
+    vector<float> a_data(2 * 3 * 4 * 3 * 2);
+    for (int i = 0; i < 2 * 3 * 4 * 3 * 2; i++)
+    {
+        a_data[i] = float(i + 1);
+    }
+
+    vector<float> b_data(2 * 3 * 3 * 3 * 2);
+    for (int i = 0; i < 2 * 3 * 3 * 3 * 2; i++)
+    {
+        b_data[i] = 1000 + float(i + 1);
+    }
+
+    vector<float> c_data(2 * 3 * 2 * 3 * 2);
+    for (int i = 0; i < 2 * 3 * 2 * 3 * 2; i++)
+    {
+        c_data[i] = 2000 + float(i + 1);
+    }
+
+    auto shape_a = Shape{2, 3, 4, 3, 2};
+    auto A = make_shared<op::Parameter>(
+        make_shared<TensorViewType>(element::Float32::element_type(), shape_a));
+    auto shape_b = Shape{2, 3, 3, 3, 2};
+    auto B = make_shared<op::Parameter>(
+        make_shared<TensorViewType>(element::Float32::element_type(), shape_b));
+    auto shape_c = Shape{2, 3, 2, 3, 2};
+    auto C = make_shared<op::Parameter>(
+        make_shared<TensorViewType>(element::Float32::element_type(), shape_c));
+    auto shape_r = Shape{2, 3, 9, 3, 2};
+    auto rt = make_shared<TensorViewType>(element::Float32::element_type(), shape_r);
+    auto r = make_shared<op::Concat>(Nodes{A, B, C}, 2);
+    auto f = make_shared<Function>(r, rt, op::Parameters{A, B, C});
+
+    auto manager = runtime::Manager::get("${BACKEND_NAME}");
+    auto external = manager->compile(f);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    // Create some tensors for input/output
+    auto a = backend->make_primary_tensor_view(element::Float32::element_type(), shape_a);
+    copy_data(a, a_data);
+    auto b = backend->make_primary_tensor_view(element::Float32::element_type(), shape_b);
+    copy_data(b, b_data);
+    auto c = backend->make_primary_tensor_view(element::Float32::element_type(), shape_c);
+    copy_data(c, c_data);
+
+    auto result = backend->make_primary_tensor_view(element::Float32::element_type(), shape_r);
+
+    cf->call({a, b, c}, {result});
+    EXPECT_EQ(
+        (vector<float>{
+            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
+            13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.,
+            1001., 1002., 1003., 1004., 1005., 1006., 1007., 1008., 1009.,
+            1010., 1011., 1012., 1013., 1014., 1015., 1016., 1017., 1018.,
+            2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008., 2009., 2010., 2011., 2012.,
+            25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36.,
+            37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48.,
+            1019., 1020., 1021., 1022., 1023., 1024., 1025., 1026., 1027.,
+            1028., 1029., 1030., 1031., 1032., 1033., 1034., 1035., 1036.,
+            2013., 2014., 2015., 2016., 2017., 2018., 2019., 2020., 2021., 2022., 2023., 2024.,
+            49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60.,
+            61., 62., 63., 64., 65., 66., 67., 68., 69., 70., 71., 72.,
+            1037., 1038., 1039., 1040., 1041., 1042., 1043., 1044., 1045.,
+            1046., 1047., 1048., 1049., 1050., 1051., 1052., 1053., 1054.,
+            2025., 2026., 2027., 2028., 2029., 2030., 2031., 2032., 2033., 2034., 2035., 2036.,
+            73., 74., 75., 76., 77., 78., 79., 80., 81., 82., 83., 84.,
+            85., 86., 87., 88., 89., 90., 91., 92., 93., 94., 95., 96.,
+            1055., 1056., 1057., 1058., 1059., 1060., 1061., 1062., 1063.,
+            1064., 1065., 1066., 1067., 1068., 1069., 1070., 1071., 1072.,
+            2037., 2038., 2039., 2040., 2041., 2042., 2043., 2044., 2045., 2046., 2047., 2048.,
+            97., 98., 99., 100., 101., 102., 103., 104., 105., 106., 107., 108.,
+            109., 110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120.,
+            1073., 1074., 1075., 1076., 1077., 1078., 1079., 1080., 1081.,
+            1082., 1083., 1084., 1085., 1086., 1087., 1088., 1089., 1090.,
+            2049., 2050., 2051., 2052., 2053., 2054., 2055., 2056., 2057., 2058., 2059., 2060.,
+            121., 122., 123., 124., 125., 126., 127., 128., 129., 130., 131., 132.,
+            133., 134., 135., 136., 137., 138., 139., 140., 141., 142., 143., 144.,
+            1091., 1092., 1093., 1094., 1095., 1096., 1097., 1098., 1099.,
+            1100., 1101., 1102., 1103., 1104., 1105., 1106., 1107., 1108.,
+            2061., 2062., 2063., 2064., 2065., 2066., 2067., 2068., 2069., 2070., 2071., 2072.}),
+        result->get_vector<float>());
+}

 TEST(${BACKEND_NAME}, divide)
 {
     auto manager = runtime::Manager::get("${BACKEND_NAME}");

@@ -2594,7 +2723,7 @@ TEST(DISABLED_${BACKEND_NAME}, reshape_6d)
     auto result = backend->make_primary_tensor_view(element::Float32::element_type(), shape_r);

     cf->call({a}, {result});
-    ASSERT_EQ(
+    EXPECT_EQ(
         (vector<float>{1., 73., 9., 81., 17., 89., 2., 74., 10., 82., 18., 90., 3., 75.,
                        11., 83., 19., 91., 4., 76., 12., 84., 20., 92., 145., 217., 153., 225.,
...

@@ -4190,7 +4319,7 @@ TEST(DISABLED_${BACKEND_NAME}, dot_3d_multi_axis)
     auto result = backend->make_primary_tensor_view(element::Float32::element_type(), shape_r);

     cf->call({a, b}, {result});
-    ASSERT_EQ((vector<float>{2938., 3016., 3094., 3172., 3250., 7042., 7264., 7486., 7708., 7930.}),
+    EXPECT_EQ((vector<float>{2938., 3016., 3094., 3172., 3250., 7042., 7264., 7486., 7708., 7930.}),
               result->get_vector<float>());
 }

@@ -4247,7 +4376,7 @@ TEST(DISABLED_${BACKEND_NAME}, dot_3d_one_axis_arbitrary)
     auto result = backend->make_primary_tensor_view(element::Float32::element_type(), shape_r);

     cf->call({a, b}, {result});
-    ASSERT_EQ((vector<float>{483, 189, 331, 86, 85, 1262, 2155, 354, 83, 18, 58, 543, 77,
+    EXPECT_EQ((vector<float>{483, 189, 331, 86, 85, 1262, 2155, 354, 83, 18, 58, 543, 77,
                              241, 325, 286, 859, 144, 438, 1025, 317, 973, 1041, 2930, 163, 69,
                              117, 50, 29, 472, 819, 62, 785, 236, 476, 235, 175, 1521, 2387, 1402,
                              97, 29, 69, 412, 63, 286, 429, 218, 45, 11, 29, 162,
...

@@ -4321,7 +4450,7 @@ TEST(DISABLED_${BACKEND_NAME}, dot_4d_5d_multi_axis)
     auto result = backend->make_primary_tensor_view(element::Float32::element_type(), shape_r);

     cf->call({a, b}, {result});
-    ASSERT_EQ(
+    EXPECT_EQ(
         (vector<float>{6942., 7020., 7098., 7176., 7254., 7332., 7410., 7488., 7566., 7644.,
                        7722., 7800., 16590., 16812., 17034., 17256., 17478., 17700., 17922.,
                        18144., 18366., 18588., 18810., 19032., 26238., 26604., 26970.,
...

@@ -4388,7 +4517,7 @@ TEST(DISABLED_${BACKEND_NAME}, dot_4d_5d_multi_axis_more)
     auto result = backend->make_primary_tensor_view(element::Float32::element_type(), shape_r);

     cf->call({a, b}, {result});
-    ASSERT_EQ((vector<float>{251412., 254040.}), result->get_vector<float>());
+    EXPECT_EQ((vector<float>{251412., 254040.}), result->get_vector<float>());
 }

@@ -4456,14 +4585,14 @@ TEST(DISABLED_${BACKEND_NAME}, dot_4d_5d_multi_axis_big_fp64_VERY_SLOW)
     auto result = backend->make_primary_tensor_view(element::Float64::element_type(), shape_r);

     cf->call({a, b}, {result});
-    ASSERT_EQ((vector<double>{2.48832025919525478400e+18, 2.48832051839533977600e+18,
-                              2.48832077759658444800e+18, 2.48832103679413504000e+18,
-                              2.48832129599669350400e+18, 2.48832155519793971200e+18,
-                              2.48832181439802265600e+18, 2.48832207359808000000e+18,
-                              2.48832233279813580800e+18, 2.48832259199822028800e+18,
-                              2.48832285119946496000e+18, 2.48832311040043008000e+18,
-                              2.48832336959957401600e+18, 2.48832362880081817600e+18,
-                              2.48832388800090368000e+18, 2.48832414720096000000e+18,
-                              2.48832440640101478400e+18, 2.48832466560109772800e+18,
-                              2.48832492480234188800e+18, 2.48832518400031897600e+18}),
-              result->get_vector<double>());
+    EXPECT_TRUE(test::all_close(
+        vector<double>{2.48832025919525478400e+18, 2.48832051839533977600e+18,
+                       2.48832077759658444800e+18, 2.48832103679413504000e+18,
+                       2.48832129599669350400e+18, 2.48832155519793971200e+18,
+                       2.48832181439802265600e+18, 2.48832207359808000000e+18,
+                       2.48832233279813580800e+18, 2.48832259199822028800e+18,
+                       2.48832285119946496000e+18, 2.48832311040043008000e+18,
+                       2.48832336959957401600e+18, 2.48832362880081817600e+18,
+                       2.48832388800090368000e+18, 2.48832414720096000000e+18,
+                       2.48832440640101478400e+18, 2.48832466560109772800e+18,
+                       2.48832492480234188800e+18, 2.48832518400031897600e+18},
+        result->get_vector<double>()));
 }
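The last hunk swaps an exact ASSERT_EQ on doubles for test::all_close, the usual fix when different backends accumulate rounding differently. As a hedged sketch only (the real helper in util/all_close.hpp may use a different signature and defaults), an elementwise closeness check of this kind typically looks like:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Hypothetical restatement: |a[i] - b[i]| <= atol + rtol * |b[i]| for all i.
    bool all_close_sketch(const std::vector<double>& a,
                          const std::vector<double>& b,
                          double rtol = 1e-5,
                          double atol = 1e-8)
    {
        if (a.size() != b.size())
        {
            return false;
        }
        for (std::size_t i = 0; i < a.size(); i++)
        {
            if (std::fabs(a[i] - b[i]) > atol + rtol * std::fabs(b[i]))
            {
                return false;
            }
        }
        return true;
    }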