Unverified Commit 7df687c1 authored by Matthew Brookhart's avatar Matthew Brookhart Committed by GitHub

Switch from Eigen to OpenMP for loops for DS2 kernels (#345)

* speed up reduceslice with kernel emitter

* const-ify and fix a clang warning

* add elementwise ops, slice to for loops

* add broadcast codegen

* add Exp

* fix bugs introduced in eigen kernels

* fix another introduced bug in Eigen

* Fix an Atomic Bug with Sum, do some cleanup

* unit tests pass

* Add Reshape Op, passes Tests

* rewrite sum to correctly handle multi-threading

* Code Cleanup

* add some extra unary ops

* Address review comments

* fix an error in the review comment refactor

* Add Power op

* Add (most) of the Logic Ops

* Make Concat default to OpenMP kernel

* fix n-D reshape issue
parent 7e89f1bb
......@@ -43,6 +43,8 @@
using namespace std;
using namespace ngraph;
#define PREFER_EIGEN 0
static string eigen_vector_format(const runtime::cpu::TensorViewWrapper& tvi)
{
return "fmt::V{" + to_string(tvi.get_size()) + "}";
......@@ -69,6 +71,7 @@ void runtime::cpu::CPU_Emitter::EmitAdd(const ngraph::Node* n,
// the right alignment instead of Eigen::Unaligned
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << "Eigen::Map<Eigen::Array<" << out[0].get_element_type().c_type_string() << ", "
<< out[0].get_size() << ", 1>, Eigen::Unaligned> out(" << out[0].get_name() << ");\n";
m_out << "Eigen::Map<Eigen::Array<" << args[0].get_element_type().c_type_string() << ", "
......@@ -76,7 +79,14 @@ void runtime::cpu::CPU_Emitter::EmitAdd(const ngraph::Node* n,
m_out << "Eigen::Map<Eigen::Array<" << args[1].get_element_type().c_type_string() << ", "
<< args[1].get_size() << ", 1>, Eigen::Unaligned> arg1(" << args[1].get_name() << ");\n";
m_out << "out = arg0 + arg1;\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] + "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -166,9 +176,18 @@ void runtime::cpu::CPU_Emitter::EmitMultiply(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << " *\n"
<< " " << emit_array1d(args[1]) << ";\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] * "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -209,8 +228,16 @@ void runtime::cpu::CPU_Emitter::EmitAbs(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n";
m_out << "Eigen::abs(" << emit_array1d(args[0]) << ");\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = std::abs(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -221,6 +248,7 @@ void runtime::cpu::CPU_Emitter::EmitConcat(const ngraph::Node* n,
{
auto result_shape = out[0].get_shape();
#if PREFER_EIGEN == 1
if (result_shape.size() == 1)
{
m_out << "{ // " << n->get_name() << "\n";
......@@ -294,15 +322,35 @@ void runtime::cpu::CPU_Emitter::EmitConcat(const ngraph::Node* n,
arg_shapes.push_back(arg.get_shape());
}
kernels::emit_concat(m_out,
args[0].get_element_type().c_type_string(),
arg_names,
out[0].get_name(),
arg_shapes,
result_shape,
axis);
kernel::emit_concat(m_out,
args[0].get_element_type().c_type_string(),
arg_names,
out[0].get_name(),
arg_shapes,
result_shape,
axis);
}
}
#else
auto axis = (dynamic_cast<const op::Concat*>(n))->get_concatenation_axis();
std::vector<std::string> arg_names;
std::vector<Shape> arg_shapes;
for (auto arg : args)
{
arg_names.push_back(arg.get_name());
arg_shapes.push_back(arg.get_shape());
}
kernel::emit_concat(m_out,
args[0].get_element_type().c_type_string(),
arg_names,
out[0].get_name(),
arg_shapes,
result_shape,
axis);
#endif
}
void runtime::cpu::CPU_Emitter::EmitDivide(const ngraph::Node* n,
......@@ -321,9 +369,18 @@ void runtime::cpu::CPU_Emitter::EmitDivide(const ngraph::Node* n,
<< "[i] == 0) throw std::runtime_error(\"integer divide by zero\");\n";
m_out << "}\n";
}
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << " /\n"
<< " " << emit_array1d(args[1]) << ";\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] / "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -334,9 +391,18 @@ void runtime::cpu::CPU_Emitter::EmitEqual(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " ==\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] == " << args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -347,9 +413,18 @@ void runtime::cpu::CPU_Emitter::EmitGreater(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << " xxx\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " >\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] > "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -360,9 +435,18 @@ void runtime::cpu::CPU_Emitter::EmitGreaterEq(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " >=\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] >= " << args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -373,9 +457,18 @@ void runtime::cpu::CPU_Emitter::EmitLess(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " <\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] < "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -386,9 +479,18 @@ void runtime::cpu::CPU_Emitter::EmitLessEq(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " <=\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] <= " << args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -399,8 +501,16 @@ void runtime::cpu::CPU_Emitter::EmitLog(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " Eigen::log(" << emit_array1d(args[0]) << ");\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = log(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -411,9 +521,19 @@ void runtime::cpu::CPU_Emitter::EmitMaximum(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".max(\n"
<< " " << emit_array1d(args[1]) << ");\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] > "
<< args[1].get_name() << "[i] ? " << args[0].get_name() << "[i] : " << args[1].get_name()
<< "[i] ;\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -424,9 +544,19 @@ void runtime::cpu::CPU_Emitter::EmitMinimum(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".min(\n"
<< " " << emit_array1d(args[1]) << ");\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] < "
<< args[1].get_name() << "[i] ? " << args[0].get_name() << "[i] : " << args[1].get_name()
<< "[i] ;\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -437,8 +567,16 @@ void runtime::cpu::CPU_Emitter::EmitNegative(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " -" << emit_array1d(args[0]) << ";\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = -" << args[0].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -449,9 +587,18 @@ void runtime::cpu::CPU_Emitter::EmitNotEqual(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " (" << emit_array1d(args[0]) << " !=\n"
<< " " << emit_array1d(args[1]) << ").template cast<char>();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] != " << args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -476,9 +623,18 @@ void runtime::cpu::CPU_Emitter::EmitSubtract(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << " -\n"
<< " " << emit_array1d(args[1]) << ";\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] - "
<< args[1].get_name() << "[i];\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -489,6 +645,9 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(const ngraph::Node* n,
{
auto broadcast = static_cast<const op::Broadcast*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
......@@ -554,6 +713,17 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(const ngraph::Node* n,
m_out << " {" << join(result_shape) << "},\n";
m_out << " {" << join(broadcast->get_broadcast_axes()) << "});\n";
}
#else
kernel::emit_broadcast(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
broadcast->get_broadcast_axes());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitConvert(const ngraph::Node* n,
......@@ -582,7 +752,9 @@ void runtime::cpu::CPU_Emitter::EmitReshape(const ngraph::Node* n,
const vector<runtime::cpu::TensorViewWrapper>& out)
{
auto reshape = static_cast<const op::Reshape*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
......@@ -645,6 +817,17 @@ void runtime::cpu::CPU_Emitter::EmitReshape(const ngraph::Node* n,
throw ngraph_error(
"Axis permutation in reshape is not implemented yet for tensors with rank>2");
}
#else
kernel::emit_reshape(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
reshape->get_input_order());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitFunctionCall(
......@@ -860,6 +1043,9 @@ void runtime::cpu::CPU_Emitter::EmitSlice(const ngraph::Node* n,
{
const op::Slice* slice = static_cast<const op::Slice*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
size_t arg_rank = args[0].get_shape().size();
const Coordinate& lower_bounds = slice->get_lower_bounds();
......@@ -919,6 +1105,19 @@ void runtime::cpu::CPU_Emitter::EmitSlice(const ngraph::Node* n,
m_out << " {" << join(slice->get_strides()) << "},\n";
m_out << " {" << join(out[0].get_shape()) << "});\n";
}
#else
kernel::emit_slice(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
slice->get_lower_bounds(),
slice->get_upper_bounds(),
slice->get_strides());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitSum(const ngraph::Node* n,
......@@ -926,6 +1125,9 @@ void runtime::cpu::CPU_Emitter::EmitSum(const ngraph::Node* n,
const vector<runtime::cpu::TensorViewWrapper>& out)
{
const op::Sum* sum = static_cast<const op::Sum*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
const Shape& arg_shape = args[0].get_shape();
size_t arg_rank = arg_shape.size();
const AxisSet& reduction_axes = sum->get_reduction_axes();
......@@ -977,6 +1179,17 @@ void runtime::cpu::CPU_Emitter::EmitSum(const ngraph::Node* n,
m_out << " {" << join(out[0].get_shape()) << "},\n";
m_out << " {" << join(sum->get_reduction_axes()) << "});\n";
}
#else
kernel::emit_sum(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
sum->get_reduction_axes());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitExp(const ngraph::Node* n,
......@@ -985,8 +1198,16 @@ void runtime::cpu::CPU_Emitter::EmitExp(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".exp();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = exp(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -997,8 +1218,16 @@ void runtime::cpu::CPU_Emitter::EmitSin(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".sin();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = sin(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1009,8 +1238,16 @@ void runtime::cpu::CPU_Emitter::EmitSinh(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".sinh();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = sinh(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1021,8 +1258,16 @@ void runtime::cpu::CPU_Emitter::EmitCos(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".cos();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = cos(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1033,8 +1278,16 @@ void runtime::cpu::CPU_Emitter::EmitCosh(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".cosh();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = cosh(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1045,8 +1298,16 @@ void runtime::cpu::CPU_Emitter::EmitTan(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".tan();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = tan(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1061,6 +1322,9 @@ void runtime::cpu::CPU_Emitter::EmitTanh(const ngraph::Node* n,
// by models
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 0
m_out << "#pragma omp parallel for\n";
#endif
m_out << "for (size_t i=0; i<" << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = tanh(" << args[0].get_name() << "[i]);\n";
......@@ -1075,8 +1339,16 @@ void runtime::cpu::CPU_Emitter::EmitAsin(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".asin();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = asin(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1087,8 +1359,16 @@ void runtime::cpu::CPU_Emitter::EmitAcos(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".acos();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = acos(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1099,8 +1379,16 @@ void runtime::cpu::CPU_Emitter::EmitAtan(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " =\n"
<< " " << emit_array1d(args[0]) << ".atan();\n";
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = atan(" << args[0].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1111,11 +1399,21 @@ void runtime::cpu::CPU_Emitter::EmitPower(const ngraph::Node* n,
{
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
m_out << emit_array1d(out[0]) << " = \n";
m_out.indent++;
m_out << emit_array1d(args[0]) << ".pow(\n ";
m_out << emit_array1d(args[1]) << ");\n";
m_out.indent -= 2;
m_out.indent--;
#else
m_out << "#pragma omp parallel for\n";
m_out << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = pow(" << args[0].get_name() << "[i], "
<< args[1].get_name() << "[i]);\n";
m_out << "}\n";
#endif
m_out.indent--;
m_out << "}\n";
}
......@@ -1125,7 +1423,9 @@ void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
const vector<runtime::cpu::TensorViewWrapper>& out)
{
auto replace_slice = static_cast<const op::Slice*>(n);
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
#if PREFER_EIGEN == 1
size_t arg0_rank = args[0].get_shape().size();
auto& lower_bounds = replace_slice->get_lower_bounds();
......@@ -1192,6 +1492,20 @@ void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
m_out << " {" << join(replace_slice->get_strides()) << "},\n";
m_out << " {" << join(out[0].get_shape()) << "});\n";
}
#else
kernel::emit_replace_slice(m_out,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
args[1].get_name(),
out[0].get_name(),
args[1].get_shape(),
out[0].get_shape(),
replace_slice->get_lower_bounds(),
replace_slice->get_upper_bounds(),
replace_slice->get_strides());
#endif
m_out.indent--;
m_out << "}\n";
}
void runtime::cpu::CPU_Emitter::EmitOneHot(const ngraph::Node* n,
......@@ -1295,6 +1609,9 @@ void runtime::cpu::CPU_Emitter::EmitCeiling(const ngraph::Node* n,
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
size_t element_count = out[0].get_size();
#if PREFER_EIGEN == 0
m_out << "#pragma omp parallel for\n";
#endif
m_out << "for (size_t i = 0; i < " << element_count << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = ceil(" << args[0].get_name() << "[i]);\n";
......@@ -1310,6 +1627,9 @@ void runtime::cpu::CPU_Emitter::EmitFloor(const ngraph::Node* n,
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
size_t element_count = out[0].get_size();
#if PREFER_EIGEN == 0
m_out << "#pragma omp parallel for\n";
#endif
m_out << "for (size_t i = 0; i < " << element_count << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = floor(" << args[0].get_name() << "[i]);\n";
......@@ -1325,6 +1645,9 @@ void runtime::cpu::CPU_Emitter::EmitSqrt(const ngraph::Node* n,
m_out << "{ // " << n->get_name() << "\n";
m_out.indent++;
size_t element_count = out[0].get_size();
#if PREFER_EIGEN == 0
m_out << "#pragma omp parallel for\n";
#endif
m_out << "for (size_t i = 0; i < " << element_count << "; i++)\n";
m_out << "{\n";
m_out << " " << out[0].get_name() << "[i] = sqrt(" << args[0].get_name() << "[i]);\n";
......
......@@ -11,24 +11,129 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// ----------------------------------------------------------------------------
#include <algorithm>
#include <map>
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/runtime/cpu/cpu_kernel_utils.hpp"
using namespace ngraph;
using namespace ngraph::runtime::cpu::kernels;
using namespace ngraph::runtime::cpu::kernel;
// Function to take a vector of data, say 1,2,3 and return
// a string representing multi-index access, i.e "[1][2][3]".
// An empty vector yields the empty string.
// Takes the vector by const reference: the original passed by value,
// copying the container on every call for no benefit.
template <typename T>
std::string emit_bracketed_string(const std::vector<T>& data)
{
    std::stringstream ss;
    // The loop naturally produces "" for an empty vector, so no special case
    // is needed.
    for (const auto& s : data)
    {
        ss << "[" << s << "]";
    }
    return ss.str();
}
// Reinterpret a flat buffer as a C-style multi-dimensional array so the
// generated code can index it as nd_name[i][j]....  Emits one declaration
// line into `writer` and returns the freshly generated array name.
std::string recast_tmp_var(codegen::CodeWriter& writer,
                           const std::string& element_type,
                           const std::string& arg_name,
                           const Shape& arg_shape,
                           const std::string& tmp_name)
{
    // Fresh identifier for the reinterpreted view, and its "[d0][d1]..." suffix.
    const std::string nd_view = writer.generate_temporary_name(tmp_name);
    const std::string dims = emit_bracketed_string(arg_shape);

    // Emits: element_type(&nd_view)[d0][d1]... = *reinterpret_cast<element_type(*)[d0][d1]...>(arg_name);
    writer << element_type << "(&" << nd_view << ")" << dims << " = *reinterpret_cast<"
           << element_type << "(*)" << dims << ">(" << arg_name << ");\n";
    return nd_view;
}
// Emit the opening lines of one nested for-loop per entry of `top`, each
// loop running from bottom[i] (default 0) to top[i].  A fresh index
// variable is generated for each loop; the list of those variables is
// returned so close_for_loops can unwind them later.
std::vector<std::string>
    open_for_loops(codegen::CodeWriter& writer, const Shape& top, const Shape& bottom = {})
{
    // Default the lower bounds to all-zeros when none were supplied.
    const Shape lower = bottom.empty() ? Shape(top.size(), 0) : bottom;

    std::vector<std::string> loop_vars;
    loop_vars.reserve(top.size());
    for (size_t dim = 0; dim < top.size(); ++dim)
    {
        auto var = writer.generate_temporary_name("i");
        // Only the outermost loop (dim == 0) is flagged as a candidate for
        // parallelization by start_index_loop.
        writer << start_index_loop(var, lower[dim], top[dim], dim == 0);
        writer.indent++;
        loop_vars.push_back(var);
    }
    return loop_vars;
}
// Close the nested loops created by open_for_loops, innermost first.
void close_for_loops(codegen::CodeWriter& writer, const std::vector<std::string>& index_vars)
{
    for (auto it = index_vars.rbegin(); it != index_vars.rend(); ++it)
    {
        writer.indent--;
        writer << end_index_loop(*it);
    }
}
// Emit C code that broadcasts arg0 into out: each output element is copied
// from the input element whose index is the output index with the
// broadcast axes removed.
void ngraph::runtime::cpu::kernel::emit_broadcast(codegen::CodeWriter& writer,
                                                  const std::string& element_type,
                                                  const std::string& arg0, // broadcast source
                                                  const std::string& out,
                                                  const Shape& arg0_shape,
                                                  const Shape& out_shape,
                                                  const AxisSet& broadcast_axes)
{
    // View the flat buffers as N-d arrays so we can use [i][j]... indexing.
    auto src = recast_tmp_var(writer, element_type, arg0, arg0_shape, "source_nd");
    auto dst = recast_tmp_var(writer, element_type, out, out_shape, "dest_nd");

    // One generated loop per output dimension.
    auto loop_vars = open_for_loops(writer, out_shape);

    // The source index is the output index with broadcast axes dropped.
    std::vector<std::string> src_indexes;
    for (size_t axis = 0; axis < out_shape.size(); ++axis)
    {
        if (broadcast_axes.count(axis) == 0)
        {
            src_indexes.push_back(loop_vars[axis]);
        }
    }

    // Per-element copy.
    writer << dst << emit_bracketed_string(loop_vars) << " = " << src
           << emit_bracketed_string(src_indexes) << ";\n";

    close_for_loops(writer, loop_vars);
}
//
// For the reference kernel this is based on, see ngraph/runtime/kernel/concat.hpp.
//
void ngraph::runtime::cpu::kernels::emit_concat(codegen::CodeWriter& writer,
std::string element_type,
const std::vector<std::string> args,
std::string out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
void ngraph::runtime::cpu::kernel::emit_concat(codegen::CodeWriter& writer,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis)
{
size_t concatenation_pos = 0;
......@@ -49,3 +154,213 @@ void ngraph::runtime::cpu::kernels::emit_concat(codegen::CodeWriter& writer,
concatenation_pos += in_shapes[i][concatenation_axis];
}
}
// Emit C code for ReplaceSlice: the output starts as a copy of the context
// tensor (arg0), after which the replacement value (arg1) is written over
// the strided sub-region described by the bounds and strides.
void ngraph::runtime::cpu::kernel::emit_replace_slice(
    codegen::CodeWriter& writer,
    const std::string& element_type,
    const std::string& arg0, // replacement context
    const std::string& arg1, // replacement value
    const std::string& out,
    const Shape& arg1_shape,
    const Shape& out_shape,
    const Coordinate& lower_bounds,
    const Coordinate& upper_bounds,
    const Strides& strides)
{
    // First blanket-copy the whole context tensor into the output.
    CoordinateTransform whole(out_shape);
    emit_pointwise_copy(writer, element_type, arg0, out, whole, whole);

    // Then overwrite the target sub-region with the replacement value.
    CoordinateTransform value_region(arg1_shape);
    CoordinateTransform target_region(out_shape, lower_bounds, upper_bounds, strides);
    emit_pointwise_copy(writer, element_type, arg1, out, value_region, target_region);
}
// Emit C code for Slice: each output element is copied from the input
// position "lower_bound + loop_index * stride" along every non-degenerate
// axis; axes whose lower and upper bounds coincide are pinned at the lower
// bound and consume no loop variable.
void ngraph::runtime::cpu::kernel::emit_slice(codegen::CodeWriter& writer,
                                              const std::string& element_type,
                                              const std::string& arg0, // slice source
                                              const std::string& out,
                                              const Shape& arg0_shape,
                                              const Shape& out_shape,
                                              const Coordinate& lower_bounds,
                                              const Coordinate& upper_bounds,
                                              const Strides& strides)
{
    // View the flat buffers as N-d arrays.
    auto src = recast_tmp_var(writer, element_type, arg0, arg0_shape, "source_nd");
    auto dst = recast_tmp_var(writer, element_type, out, out_shape, "dest_nd");

    // One generated loop per output dimension.
    auto loop_vars = open_for_loops(writer, out_shape);

    // Build the source index expression for every input axis.
    std::vector<std::string> src_indexes;
    size_t next_loop = 0;
    for (size_t axis = 0; axis < lower_bounds.size(); ++axis)
    {
        if (lower_bounds[axis] == upper_bounds[axis])
        {
            // Degenerate axis: fixed at the lower bound.
            src_indexes.push_back(std::to_string(lower_bounds[axis]));
        }
        else
        {
            src_indexes.push_back(std::to_string(lower_bounds[axis]) + " + " +
                                  loop_vars[next_loop] + " * " + std::to_string(strides[axis]));
            ++next_loop;
        }
    }

    // Per-element copy from the strided source position.
    writer << dst << emit_bracketed_string(loop_vars) << " = " << src
           << emit_bracketed_string(src_indexes) << ";\n";

    close_for_loops(writer, loop_vars);
}
// Emit C code for Reshape: the input is walked with one loop per input
// dimension, in the order given by arg0_axis_order, and each element is
// written into the output viewed as a flat 1-D array.  Walking in permuted
// order while writing sequentially realizes the transpose.
void ngraph::runtime::cpu::kernel::emit_reshape(codegen::CodeWriter& writer,
                                                const std::string& element_type,
                                                const std::string& arg0, // replacement context
                                                const std::string& out,
                                                const Shape& arg0_shape,
                                                const Shape& out_shape,
                                                const AxisVector& arg0_axis_order)
{
    // get the total number of elements
    // NOTE(review): zero-valued dimensions are skipped in this product, so a
    // shape containing a 0 still yields a nonzero "size" for the 1-D recast
    // below -- presumably deliberate for empty tensors; confirm.
    size_t size = 1;
    for (auto x : out_shape)
    {
        if (x != 0)
            size *= x;
    }

    // create input and output arrays
    // The output is recast as a flat [size] array rather than out_shape.
    auto source_nd_name = recast_tmp_var(writer, element_type, arg0, arg0_shape, "source_nd");
    auto dest_nd_name = recast_tmp_var(writer, element_type, out, {size}, "dest_nd");

    // input_to_loop_pos maps loop position -> input axis; loop_to_input_pos
    // is its inverse (input axis -> loop position).
    std::map<size_t, size_t> input_to_loop_pos;
    std::map<size_t, size_t> loop_to_input_pos;

    // loop over the input in the order of arg0_axis_order
    int input_pos = 0;
    Shape ordered_input_shape;
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        ordered_input_shape.push_back(arg0_shape[arg0_axis_order[i]]);
        input_to_loop_pos[input_pos] = arg0_axis_order[i];
        input_pos += 1;
    }

    // Invert the mapping so we can recover, for each input axis, which loop
    // variable iterates over it.
    for (auto kv : input_to_loop_pos)
    {
        loop_to_input_pos[kv.second] = kv.first;
    }

    // One generated loop per (permuted) input dimension.
    auto index_vars = open_for_loops(writer, ordered_input_shape);

    // write the output reshape as a 1D array by calculating the
    // position of the input iterators in the output array
    // The flat index is sum_i index_vars[i] * prod_{j>i} ordered_input_shape[j]
    // (row-major strides over the permuted shape).
    writer << dest_nd_name << "[ 0";
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        writer << " + " << index_vars[i];
        // NOTE(review): zero dims are excluded from the stride product to
        // mirror the "size" computation above -- verify against n-D cases.
        for (auto j = i + 1; j < arg0_shape.size(); j++)
        {
            if (arg0_shape[j] > 0)
            {
                writer << " * " << ordered_input_shape[j];
            }
        }
    }
    // Read the source at the loop variables mapped back to input-axis order.
    writer << "] = " << source_nd_name;
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        writer << "[" << index_vars[loop_to_input_pos[i]] << "]";
    }
    writer << ";\n";
    close_for_loops(writer, index_vars);
}
// Emit C code for Sum: zero the output, then accumulate every input element
// into the output slot addressed by the non-reduced axes.  Only the first
// non-reduced axis is parallelized, because iterations of reduced axes all
// write the same output slots and would race under OpenMP.
void ngraph::runtime::cpu::kernel::emit_sum(codegen::CodeWriter& writer,
                                            const std::string& element_type,
                                            const std::string& arg0, // sum source
                                            const std::string& out,
                                            const Shape& arg0_shape,
                                            const Shape& out_shape,
                                            const AxisSet& reduction_axes)
{
    // create input and output arrays
    auto source_nd_name = recast_tmp_var(writer, element_type, arg0, arg0_shape, "source_nd");
    auto dest_nd_name = recast_tmp_var(writer, element_type, out, out_shape, "dest_nd");

    // zero the output to make sure we don't have randomly initialized data
    if (out_shape.size() == 0)
    {
        // Scalar output (full reduction).
        writer << dest_nd_name << " = 0;\n";
    }
    else
    {
        auto output_vars = open_for_loops(writer, out_shape);
        writer << dest_nd_name << emit_bracketed_string(output_vars) << " = 0;\n";
        close_for_loops(writer, output_vars);
    }

    // If any input dimension is zero there is nothing to accumulate.
    if (std::find(arg0_shape.begin(), arg0_shape.end(), 0) == arg0_shape.end())
    {
        // create the iteration variables without writing the for loops yet;
        // the outer (parallel) loop must be chosen before any loop is opened
        std::vector<std::string> index_vars;
        index_vars.reserve(arg0_shape.size());
        for (size_t i = 0; i < arg0_shape.size(); i++)
        {
            index_vars.push_back(writer.generate_temporary_name("i"));
        }

        // Explicit sentinel instead of the previous `size_t x = -1; x != -1`
        // pattern, which relied on implicit signed->unsigned conversion and
        // triggers sign-compare warnings.
        constexpr size_t no_axis = static_cast<size_t>(-1);

        // calculate the output indexes based on what's being reduced, and
        // remember the first non-reduced axis (if any)
        size_t outer_arg_index = no_axis;
        std::vector<std::string> out_indexes;
        for (size_t i = 0; i < index_vars.size(); ++i)
        {
            if (reduction_axes.count(i) == 0)
            {
                if (out_indexes.size() == 0)
                {
                    outer_arg_index = i;
                }
                out_indexes.push_back(index_vars[i]);
            }
        }

        // make the first non-reduced axis our outer loop, optimize with openmp
        if (outer_arg_index != no_axis)
        {
            writer << start_index_loop(
                index_vars[outer_arg_index], 0, arg0_shape[outer_arg_index], true);
            writer.indent++;
        }

        // create the rest of the loops; don't parallelize them, since they
        // accumulate into shared output slots
        for (size_t i = 0; i < arg0_shape.size(); i++)
        {
            if (i != outer_arg_index)
            {
                writer << start_index_loop(index_vars[i], 0, arg0_shape[i], false);
                writer.indent++;
            }
        }

        writer << dest_nd_name << emit_bracketed_string(out_indexes) << " += " << source_nd_name
               << emit_bracketed_string(index_vars) << ";\n";

        close_for_loops(writer, index_vars);
    }
}
......@@ -23,15 +23,56 @@ namespace ngraph
{
namespace cpu
{
namespace kernels
namespace kernel
{
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& broadcast_axes);
void emit_concat(codegen::CodeWriter& writer,
std::string element_type,
const std::vector<std::string> args,
std::string out,
const std::string& element_type,
const std::vector<std::string>& args,
const std::string& out,
const std::vector<Shape>& in_shapes,
const Shape& out_shape,
size_t concatenation_axis);
const size_t concatenation_axis);
void emit_replace_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& arg1, // replacement value
const std::string& out,
const Shape& arg1_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_slice(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const Coordinate& lower_bounds,
const Coordinate& upper_bounds,
const Strides& strides);
void emit_reshape(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisVector& arg0_axis_order);
void emit_sum(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
const std::string& out,
const Shape& arg0_shape,
const Shape& out_shape,
const AxisSet& reduction_axes);
}
}
}
......
......@@ -19,7 +19,7 @@
#include "ngraph/util.hpp"
using namespace ngraph;
using namespace ngraph::runtime::cpu::kernels;
using namespace ngraph::runtime::cpu::kernel;
//
// Given a coordinate transform and a vector of index expressions relative to
......@@ -38,8 +38,8 @@ using namespace ngraph::runtime::cpu::kernels;
//
//
std::vector<std::string>
ngraph::runtime::cpu::kernels::emit_multi_indices(CoordinateTransform trans,
std::vector<std::string> index_vars)
ngraph::runtime::cpu::kernel::emit_multi_indices(CoordinateTransform& trans,
const std::vector<std::string>& index_vars)
{
std::vector<std::string> result;
......@@ -90,8 +90,9 @@ std::vector<std::string>
// "((4 * ((k) * 2 + 5)) + (2 * ((i) * 2 + 3)) + ((j) * 2 + 4))"
//
//
std::string ngraph::runtime::cpu::kernels::emit_linear_index(CoordinateTransform trans,
std::vector<std::string> index_vars)
std::string
ngraph::runtime::cpu::kernel::emit_linear_index(CoordinateTransform& trans,
const std::vector<std::string>& index_vars)
{
std::vector<std::string> multi_indices = emit_multi_indices(trans, index_vars);
......@@ -122,10 +123,10 @@ std::string ngraph::runtime::cpu::kernels::emit_linear_index(CoordinateTransform
//
// Optionally emits an OpenMP parallel pragma, if "omp" is true.
//
std::string ngraph::runtime::cpu::kernels::start_index_loop(std::string index_var,
size_t start,
size_t end,
bool omp)
std::string ngraph::runtime::cpu::kernel::start_index_loop(const std::string& index_var,
size_t start,
size_t end,
bool omp)
{
std::stringstream ss;
......@@ -144,7 +145,7 @@ std::string ngraph::runtime::cpu::kernels::start_index_loop(std::string index_va
//
// Ends an indexing loop on the index variable [index_var].
//
std::string ngraph::runtime::cpu::kernels::end_index_loop(std::string index_var)
std::string ngraph::runtime::cpu::kernel::end_index_loop(const std::string& index_var)
{
std::stringstream ss;
......@@ -153,7 +154,7 @@ std::string ngraph::runtime::cpu::kernels::end_index_loop(std::string index_var)
return ss.str();
}
std::string ngraph::runtime::cpu::kernels::emit_nd_sizes(CoordinateTransform trans)
std::string ngraph::runtime::cpu::kernel::emit_nd_sizes(CoordinateTransform& trans)
{
std::stringstream ss;
......@@ -165,8 +166,8 @@ std::string ngraph::runtime::cpu::kernels::emit_nd_sizes(CoordinateTransform tra
return ss.str();
}
std::string ngraph::runtime::cpu::kernels::emit_nd_index(CoordinateTransform trans,
std::vector<std::string> index_vars)
std::string ngraph::runtime::cpu::kernel::emit_nd_index(CoordinateTransform& trans,
const std::vector<std::string>& index_vars)
{
std::stringstream ss;
......@@ -182,12 +183,12 @@ std::string ngraph::runtime::cpu::kernels::emit_nd_index(CoordinateTransform tra
// Emits a pointwise copy from source_buffer mediated by in_trans, to
// dest_buffer mediated by dest_trans.
//
void ngraph::runtime::cpu::kernels::emit_pointwise_copy(codegen::CodeWriter& writer,
std::string element_type,
std::string source_buffer,
std::string dest_buffer,
CoordinateTransform source_trans,
CoordinateTransform dest_trans)
void ngraph::runtime::cpu::kernel::emit_pointwise_copy(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& source_buffer,
const std::string& dest_buffer,
CoordinateTransform& source_trans,
CoordinateTransform& dest_trans)
{
std::vector<std::string> index_vars;
......
......@@ -24,24 +24,27 @@ namespace ngraph
{
namespace cpu
{
namespace kernels
namespace kernel
{
std::vector<std::string> emit_multi_indices(CoordinateTransform trans,
std::vector<std::string> index_vars);
std::string emit_linear_index(CoordinateTransform trans,
std::vector<std::string> index_vars);
std::string
start_index_loop(std::string index_var, size_t start, size_t end, bool omp);
std::string end_index_loop(std::string index_var);
std::string emit_nd_sizes(CoordinateTransform trans);
std::string emit_nd_index(CoordinateTransform trans,
std::vector<std::string> index_vars);
std::vector<std::string>
emit_multi_indices(CoordinateTransform& trans,
const std::vector<std::string>& index_vars);
std::string emit_linear_index(CoordinateTransform& trans,
const std::vector<std::string>& index_vars);
std::string start_index_loop(const std::string& index_var,
size_t start,
size_t end,
bool omp);
std::string end_index_loop(const std::string& index_var);
std::string emit_nd_sizes(CoordinateTransform& trans);
std::string emit_nd_index(CoordinateTransform& trans,
const std::vector<std::string>& index_vars);
void emit_pointwise_copy(codegen::CodeWriter& writer,
std::string element_type,
std::string source_buffer,
std::string dest_buffer,
CoordinateTransform source_trans,
CoordinateTransform dest_trans);
const std::string& element_type,
const std::string& source_buffer,
const std::string& dest_buffer,
CoordinateTransform& source_trans,
CoordinateTransform& dest_trans);
}
}
}
......
......@@ -2368,9 +2368,7 @@ TEST(${BACKEND_NAME}, reshape_m2m_dim_change_transpose)
// 198., 270., 206., 278., 214., 286., 199., 271., 207.,
// 279., 215., 287., 200., 272., 208., 280., 216., 288.])
//
// Disabled because it doesn't work on CPU yet.
//
TEST(DISABLED_${BACKEND_NAME}, reshape_6d)
TEST(${BACKEND_NAME}, reshape_6d)
{
vector<float> a_data(2 * 2 * 3 * 3 * 2 * 4);
for (int i = 0; i < 2 * 2 * 3 * 3 * 2 * 4; i++)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment