Unverified commit 8c1aad8f authored by Jayaram Bobba, committed by GitHub

Merge pull request #1249 from NervanaSystems/jmenon/dex4

CPU Direct Execution Part 4
parents 7a35cf81 0b20b1a7
@@ -28,9 +28,25 @@ set(SRC
cpu_tensor_view.cpp
cpu_tracing.cpp
builder/avg_pool.cpp
builder/batch_norm.cpp
builder/broadcast.cpp
builder/concat.cpp
builder/convert.cpp
builder/convert_layout.cpp
builder/convolution.cpp
builder/dot.cpp
builder/function_call.cpp
builder/matmul_bias.cpp
builder/max.cpp
builder/max_pool.cpp
builder/min.cpp
builder/reshape.cpp
builder/reverse.cpp
builder/reverse_sequence.cpp
builder/select.cpp
builder/select_and_scatter.cpp
builder/sigmoid.cpp
builder/sum.cpp
kernel/eigen_thread_pool.cpp
kernel/pad.cpp
kernel/reduce_max.cpp
......
@@ -107,6 +107,8 @@ namespace ngraph
functors.emplace_back(functor);
}
}
REGISTER_OP_BUILDER(AvgPool);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <cstring>
#include "ngraph/op/broadcast.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/broadcast.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Broadcast)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& arg_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto arg_shape = args[0].get_shape();
auto out_shape = out[0].get_shape();
auto arg_rank = arg_shape.size();
auto out_rank = out_shape.size();
auto broadcast = static_cast<const ngraph::op::Broadcast*>(node);
if (broadcast->get_broadcast_axes().empty())
{
size_t size = out[0].get_size() * out[0].get_element_type().size();
auto functor = [&, size](CPURuntimeContext* ctx) {
memcpy(out_tensor, arg_tensor, size);
};
functors.emplace_back(functor);
return;
}
if (!arg_rank)
{
arg_rank = 1;
arg_shape = Shape{1};
}
auto new_shape = Shape(out_rank, 1);
const auto& broadcast_axes = broadcast->get_broadcast_axes();
size_t i = 0;
for (size_t j = 0; j < out_rank; j++)
{
if (broadcast_axes.count(j))
{
new_shape[j] = 1;
}
else
{
new_shape[j] = arg_shape[i++];
}
}
std::function<decltype(runtime::cpu::kernel::broadcast<float, 2>)> kernel;
SELECT_KERNEL_BY_RANK(
kernel, args[0].get_element_type(), out_rank, runtime::cpu::kernel::broadcast);
auto functor = [&, kernel, new_shape, out_shape](CPURuntimeContext* ctx) {
kernel(arg_tensor, out_tensor, new_shape, out_shape);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(Broadcast);
}
}
}
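The heart of the builder above is the new_shape mapping: broadcast axes get extent 1 and the remaining output axes take the input extents in order, so the Eigen kernel can broadcast by per-axis repetition. A minimal standalone sketch of the same mapping (the helper name is illustrative, not part of this change):

#include <cassert>
#include <cstddef>
#include <set>
#include <vector>

// Broadcast axes keep extent 1; every other output axis takes the next
// input extent in order (mirrors the loop in the Broadcast builder above).
std::vector<std::size_t> reshape_for_broadcast(const std::vector<std::size_t>& arg_shape,
                                               std::size_t out_rank,
                                               const std::set<std::size_t>& broadcast_axes)
{
    std::vector<std::size_t> new_shape(out_rank, 1);
    std::size_t i = 0;
    for (std::size_t j = 0; j < out_rank; j++)
    {
        if (!broadcast_axes.count(j))
        {
            new_shape[j] = arg_shape[i++];
        }
    }
    return new_shape;
}

int main()
{
    // A {3} vector broadcast to {2, 3} along axis 0 is reshaped to {1, 3}.
    assert(reshape_for_broadcast({3}, 2, {0}) == (std::vector<std::size_t>{1, 3}));
    return 0;
}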
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/concat.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/concat.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Concat)
{
auto axis =
(dynamic_cast<const ngraph::op::Concat*>(node))->get_concatenation_axis();
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
std::function<decltype(runtime::cpu::kernel::concat<float, 1>)> kernel;
SELECT_KERNEL_BY_RANK(kernel,
out[0].get_element_type(),
out[0].get_shape().size(),
runtime::cpu::kernel::concat);
vector<reference_wrapper<void*>> arg_tensors;
vector<Shape> arg_shapes;
for (auto& arg : args)
{
if (shape_size(arg.get_shape()))
{
arg_tensors.emplace_back(tensor_data[arg.get_name()]);
arg_shapes.emplace_back(arg.get_shape());
}
}
auto& out_tensor = tensor_data[out[0].get_name()];
auto out_shape = out[0].get_shape();
auto functor =
[&, kernel, arg_tensors, arg_shapes, out_shape, axis](CPURuntimeContext* ctx) {
kernel(arg_tensors, arg_shapes, out_tensor, out_shape, axis);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(Concat);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/convert.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/convert.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Convert)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& arg_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto element_count = out[0].get_size();
std::function<decltype(runtime::cpu::kernel::convert<float, int>)> kernel;
if (out[0].get_element_type() == element::boolean)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_i8);
}
else if (out[0].get_element_type() == element::f32)
{
SELECT_KERNEL(kernel,
args[0].get_element_type(),
runtime::cpu::kernel::convert_to_float32);
}
else if (out[0].get_element_type() == element::f64)
{
SELECT_KERNEL(kernel,
args[0].get_element_type(),
runtime::cpu::kernel::convert_to_float64);
}
else if (out[0].get_element_type() == element::i8)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_i8);
}
else if (out[0].get_element_type() == element::i16)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_i16);
}
else if (out[0].get_element_type() == element::i32)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_i32);
}
else if (out[0].get_element_type() == element::i64)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_i64);
}
else if (out[0].get_element_type() == element::u8)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_u8);
}
else if (out[0].get_element_type() == element::u16)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_u16);
}
else if (out[0].get_element_type() == element::u32)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_u32);
}
else if (out[0].get_element_type() == element::u64)
{
SELECT_KERNEL(
kernel, args[0].get_element_type(), runtime::cpu::kernel::convert_to_u64);
}
else
{
throw ngraph_error("Cannot convert from an invalid input element type");
}
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
kernel(arg_tensor, out_tensor, element_count);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(Convert);
}
}
}
@@ -403,6 +403,14 @@ namespace ngraph
"ConvolutionBiasBackpropFiltersBias is only supported with MKLDNN kernel.");
}
}
REGISTER_OP_BUILDER(Convolution);
REGISTER_OP_BUILDER(ConvolutionRelu);
REGISTER_OP_BUILDER(ConvolutionBias);
REGISTER_OP_BUILDER(ConvolutionBiasAdd);
REGISTER_OP_BUILDER(ConvolutionBackpropData);
REGISTER_OP_BUILDER(ConvolutionBackpropFilters);
REGISTER_OP_BUILDER(ConvolutionBiasBackpropFiltersBias);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <cstring>
#include "ngraph/op/dot.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/dot.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Dot)
{
auto dot = static_cast<const ngraph::op::Dot*>(node);
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto arg0_shape = args[0].get_shape();
auto arg1_shape = args[1].get_shape();
auto result_shape = out[0].get_shape();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto reduction_axes_count = dot->get_reduction_axes_count();
if (!shape_size(result_shape))
{
auto functor = [](CPURuntimeContext* ctx) {};
functors.emplace_back(functor);
return;
}
if (!shape_size(arg0_shape) || !shape_size(arg1_shape))
{
auto size = shape_size(result_shape) * out[0].get_element_type().size();
auto functor = [&, size](CPURuntimeContext* ctx) {
memset(out_tensor, 0, size);
};
functors.emplace_back(functor);
return;
}
if (arg0_shape.empty() || arg1_shape.empty())
{
auto first = (arg0_shape.empty() ? args[0] : args[1]);
auto second = (arg0_shape.empty() ? args[1] : args[0]);
auto& first_tensor = tensor_data[first.get_name()];
auto& second_tensor = tensor_data[second.get_name()];
std::function<decltype(runtime::cpu::kernel::dot_scalar<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::dot_scalar);
auto element_count = shape_size(second.get_shape());
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
kernel(first_tensor, second_tensor, out_tensor, element_count);
};
functors.emplace_back(functor);
return;
}
if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1) &&
reduction_axes_count == 1)
{
std::function<decltype(runtime::cpu::kernel::dot_1d_1d_1rd<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::dot_1d_1d_1rd);
auto functor =
[&, kernel, arg0_shape, arg1_shape, result_shape](CPURuntimeContext* ctx) {
kernel(arg0_tensor,
arg1_tensor,
out_tensor,
arg0_shape,
arg1_shape,
result_shape);
};
functors.emplace_back(functor);
return;
}
if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) &&
reduction_axes_count == 1)
{
std::function<decltype(runtime::cpu::kernel::dot_2d_1d_1rd<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::dot_2d_1d_1rd);
auto functor =
[&, kernel, arg0_shape, arg1_shape, result_shape](CPURuntimeContext* ctx) {
kernel(arg0_tensor,
arg1_tensor,
out_tensor,
arg0_shape,
arg1_shape,
result_shape);
};
functors.emplace_back(functor);
return;
}
if ((arg0_shape.size() == 3) && (arg1_shape.size() == 3) &&
reduction_axes_count == 1)
{
std::function<decltype(runtime::cpu::kernel::dot_3d_3d_1rd<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::dot_3d_3d_1rd);
auto functor =
[&, kernel, arg0_shape, arg1_shape, result_shape](CPURuntimeContext* ctx) {
kernel(arg0_tensor,
arg1_tensor,
out_tensor,
arg0_shape,
arg1_shape,
result_shape);
};
functors.emplace_back(functor);
return;
}
if ((arg0_shape.size() == 3) && (arg1_shape.size() == 2) &&
reduction_axes_count == 1)
{
std::function<decltype(runtime::cpu::kernel::dot_3d_2d_1rd<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::dot_3d_2d_1rd);
auto functor =
[&, kernel, arg0_shape, arg1_shape, result_shape](CPURuntimeContext* ctx) {
kernel(arg0_tensor,
arg1_tensor,
out_tensor,
arg0_shape,
arg1_shape,
result_shape);
};
functors.emplace_back(functor);
return;
}
std::function<decltype(runtime::cpu::kernel::dot<float>)> kernel;
SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::dot);
auto functor =
[&, kernel, arg0_shape, arg1_shape, result_shape, reduction_axes_count](
CPURuntimeContext* ctx) {
kernel(arg0_tensor,
arg1_tensor,
out_tensor,
arg0_shape,
arg1_shape,
result_shape,
reduction_axes_count);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(Dot);
}
}
}
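The rank-specialized paths above delegate to Eigen tensor contractions (see the dot kernel header at the end of this diff). A standalone sketch of the dot_2d_1d_1rd case, assuming only Eigen: the last axis of the matrix is contracted against the single axis of the vector.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cstdio>

// Matrix-vector dot as a rank-1-result contraction: pair axis 1 of the
// {2,3} tensor with axis 0 of the {3} tensor.
int main()
{
    Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
    Eigen::Tensor<float, 1, Eigen::RowMajor> b(3);
    a.setConstant(1.0f);
    b.setConstant(2.0f);
    Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
    Eigen::Tensor<float, 1, Eigen::RowMajor> c = a.contract(b, dims);
    std::printf("%g %g\n", c(0), c(1)); // 6 6
    return 0;
}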
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/function_call.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/tensor_view.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::FunctionCall)
{
auto function_call = static_cast<const ngraph::op::FunctionCall*>(node);
auto function = function_call->get_functions()[0];
auto backend = runtime::Backend::create("CPU");
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& callees = external_function->get_callees();
// Note: We bypass the completely broken ngraph "backend" API here
vector<reference_wrapper<void*>> arg_tensors, out_tensors;
vector<Shape> arg_shapes, out_shapes;
vector<element::Type> arg_types, out_types;
for (const auto& arg : args)
{
arg_shapes.emplace_back(arg.get_shape());
arg_types.emplace_back(arg.get_element_type());
arg_tensors.emplace_back(tensor_data[arg.get_name()]);
}
for (const auto& result : out)
{
out_shapes.emplace_back(result.get_shape());
out_types.emplace_back(result.get_element_type());
out_tensors.emplace_back(tensor_data[result.get_name()]);
}
if (!callees.count(function->get_name()))
{
callees[function->get_name()] = make_shared<CPU_ExternalFunction>(function);
}
auto& callee_external_function = callees[function->get_name()];
auto functor = [&,
backend,
arg_shapes,
arg_types,
arg_tensors,
out_shapes,
out_types,
out_tensors](CPURuntimeContext* ctx) {
TensorViewPtrs inputs, outputs;
for (int i = 0; i < arg_shapes.size(); i++)
{
inputs.emplace_back(
backend->create_tensor(arg_types[i], arg_shapes[i], arg_tensors[i]));
}
for (int i = 0; i < out_shapes.size(); i++)
{
outputs.emplace_back(
backend->create_tensor(out_types[i], out_shapes[i], out_tensors[i]));
}
auto call_frame = callee_external_function->make_call_frame();
call_frame->call(outputs, inputs);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(FunctionCall);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/cpu_kernels.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::MatmulBias)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& out0_tensor = tensor_data[out[0].get_name()];
const ngraph::op::MatmulBias* mm = static_cast<const ngraph::op::MatmulBias*>(node);
const auto& arg0_shape = mm->get_arg0_shape();
const auto& arg1_shape = mm->get_arg1_shape();
const auto& arg2_shape = node->get_shape();
auto m = arg0_shape[0];
auto n = arg1_shape[1];
auto k = arg0_shape[1];
bool transpose_A = false, transpose_B = false;
auto lda = arg0_shape[1];
auto ldb = arg1_shape[1];
if (mm->get_is_arg0_transposed())
{
transpose_A = true;
m = arg0_shape[1];
k = arg0_shape[0];
}
if (mm->get_is_arg1_transposed())
{
transpose_B = true;
n = arg1_shape[0];
}
const float beta = 0.0f;
auto mm_functor =
[&, transpose_A, transpose_B, m, n, k, lda, ldb, beta, arg2_shape](
CPURuntimeContext* ctx) {
cblas::cblas_sgemm(
cblas::Layout::RowMajor,
transpose_A ? cblas::Transpose::Transpose : cblas::Transpose::None,
transpose_B ? cblas::Transpose::Transpose : cblas::Transpose::None,
m,
n,
k,
1.0f,
static_cast<float*>(arg0_tensor),
max(1UL, lda),
static_cast<float*>(arg1_tensor),
max(1UL, ldb),
beta,
static_cast<float*>(out0_tensor),
max(1UL, arg2_shape[1]));
};
function<void(CPURuntimeContext*)> bias_functor = [](CPURuntimeContext* ctx) {};
if (args.size() > 2)
{
auto& arg2_tensor = tensor_data[args[2].get_name()];
auto axes = mm->get_broadcast_axes();
if (axes.size() == 1)
{
if (*(axes.begin()) == 0)
{
vector<float> ones_row(arg2_shape[0], 1.0f);
bias_functor = [&, ones_row, arg2_shape](CPURuntimeContext* ctx) {
cblas::cblas_sgemm(cblas::Layout::RowMajor,
cblas::Transpose::None,
cblas::Transpose::None,
arg2_shape[0],
arg2_shape[1],
1,
1.0f,
ones_row.data(),
1UL,
static_cast<float*>(arg2_tensor),
max(1UL, arg2_shape[1]),
1.0f,
static_cast<float*>(out0_tensor),
max(1UL, arg2_shape[1]));
};
}
else
{
vector<float> ones_col(arg2_shape[1], 1.0f);
bias_functor = [&, ones_col, arg2_shape](CPURuntimeContext* ctx) {
cblas::cblas_sgemm(cblas::Layout::RowMajor,
cblas::Transpose::None,
cblas::Transpose::None,
arg2_shape[0],
arg2_shape[1],
1,
1.0f,
static_cast<float*>(arg2_tensor),
1UL,
ones_col.data(),
max(1UL, arg2_shape[1]),
1.0f,
static_cast<float*>(out0_tensor),
max(1UL, arg2_shape[1]));
};
}
}
else
{
if (axes.size() != 2)
{
throw ngraph_error("unexpected broadcast rank");
}
vector<float> ones_scalar(arg2_shape[0], 1.0f);
bias_functor = [&, ones_scalar, arg2_shape](CPURuntimeContext* ctx) {
vector<float> bias(arg2_shape[1], *static_cast<float*>(arg2_tensor));
cblas::cblas_sgemm(cblas::Layout::RowMajor,
cblas::Transpose::None,
cblas::Transpose::None,
arg2_shape[0],
arg2_shape[1],
1,
1.0f,
ones_scalar.data(),
1UL,
bias.data(),
max(1UL, arg2_shape[1]),
1.0f,
static_cast<float*>(out0_tensor),
max(1UL, arg2_shape[1]));
};
}
}
auto functor = [&, mm_functor, bias_functor](CPURuntimeContext* ctx) {
mm_functor(ctx);
bias_functor(ctx);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(MatmulBias);
}
}
}
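The bias path above broadcasts by a second GEMM with K = 1 and beta = 1.0f: the outer product of a ones vector and the bias row (or column) is accumulated onto the already-computed matrix product. A standalone sketch of that trick using the plain CBLAS C interface (cblas.h is an assumption here; the cblas:: wrappers above are nGraph's own):

#include <cblas.h>
#include <cstdio>
#include <vector>

// Broadcast a length-N row bias over an M x N output via a rank-1 update:
// C += ones(M,1) * bias(1,N), i.e. a GEMM with K = 1 and beta = 1.
int main()
{
    const int M = 2, N = 3;
    std::vector<float> ones(M, 1.0f);
    std::vector<float> bias = {10.0f, 20.0f, 30.0f};
    std::vector<float> c(M * N, 1.0f); // stands in for the A*B result

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                M, N, 1,
                1.0f, ones.data(), 1,
                bias.data(), N,
                1.0f, c.data(), N);

    for (float v : c)
        std::printf("%g ", v); // 11 21 31 11 21 31
    return 0;
}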
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <cstring>
#include "ngraph/op/max.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/reduce_max.hpp"
#include "reduction.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Max)
{
BUILD_REDUCTION_FUNCTOR(Max, max);
}
REGISTER_OP_BUILDER(Max);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/kernel/max_pool.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::MaxPool)
{
auto max_pool = static_cast<const ngraph::op::MaxPool*>(node);
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto arg0_shape = args[0].get_shape();
auto out_shape = out[0].get_shape();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto window_shape = max_pool->get_window_shape();
auto window_movement_strides = max_pool->get_window_movement_strides();
auto padding_below = max_pool->get_padding_below();
auto padding_above = max_pool->get_padding_above();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_emitter->build_memory_descriptor(
args[0], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0));
auto result_desc = mkldnn_emitter->build_memory_descriptor(
out[0], runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0));
size_t max_pool_index =
mkldnn_emitter->build_pooling_forward(mkldnn::algorithm::pooling_max,
input_desc,
result_desc,
window_movement_strides,
window_shape,
padding_below,
padding_above);
auto& deps = mkldnn_emitter->get_primitive_deps(max_pool_index);
auto functor = [&, max_pool_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, max_pool_index);
};
functors.emplace_back(functor);
}
else
{
std::function<decltype(runtime::cpu::kernel::max_pool<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::max_pool);
auto functor = [&,
kernel,
arg0_shape,
out_shape,
window_shape,
window_movement_strides,
padding_below,
padding_above](CPURuntimeContext* ctx) {
kernel(arg0_tensor,
out_tensor,
arg0_shape,
out_shape,
window_shape,
window_movement_strides,
padding_below,
padding_above);
};
functors.emplace_back(functor);
}
}
REGISTER_OP_BUILDER(MaxPool);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <cstring>
#include "ngraph/op/min.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/reduce_min.hpp"
#include "reduction.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Min)
{
BUILD_REDUCTION_FUNCTOR(Min, min);
}
REGISTER_OP_BUILDER(Min);
}
}
}
@@ -171,6 +171,8 @@ namespace ngraph
functors.emplace_back(functor);
}
}
REGISTER_OP_BUILDER(Reshape);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/reverse.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/reverse.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Reverse)
{
auto reverse = static_cast<const ngraph::op::Reverse*>(node);
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& arg_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
auto reversed_axes = reverse->get_reversed_axes();
std::function<decltype(runtime::cpu::kernel::reverse<float>)> kernel;
SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::reverse);
auto functor =
[&, kernel, arg_shape, result_shape, reversed_axes](CPURuntimeContext* ctx) {
kernel(arg_tensor, out_tensor, arg_shape, result_shape, reversed_axes);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(Reverse);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/reverse_sequence.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/reverse_sequence.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::ReverseSequence)
{
auto rev_seq = static_cast<const ngraph::op::ReverseSequence*>(node);
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& arg_tensor = tensor_data[args[0].get_name()];
auto& seq_len_tensor = tensor_data[args[1].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto arg_shape = args[0].get_shape();
auto sequence_axis = rev_seq->get_sequence_axis();
auto batch_axis = rev_seq->get_batch_axis();
std::function<decltype(runtime::cpu::kernel::reverse_sequence<int, int, 4>)> kernel;
if (args[1].get_element_type() == element::i32)
{
SELECT_KERNEL_BY_RANK(kernel,
args[0].get_element_type(),
arg_shape.size(),
runtime::cpu::kernel::reverse_sequence_sli32);
}
else
{
throw ngraph_error("Unsupported sequence length type " +
args[1].get_element_type().c_type_string() +
" requires a kernel instantiation to handle this type");
}
auto functor =
[&, kernel, arg_shape, batch_axis, sequence_axis](CPURuntimeContext* ctx) {
kernel(arg_tensor,
out_tensor,
arg_shape,
batch_axis,
sequence_axis,
seq_len_tensor);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(ReverseSequence);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/select.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/select.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Select)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& arg2_tensor = tensor_data[args[2].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto element_count = args[0].get_size();
std::function<decltype(runtime::cpu::kernel::select<float>)> kernel;
SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::select);
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
kernel(arg0_tensor, arg1_tensor, arg2_tensor, out_tensor, element_count);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(Select);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/select_and_scatter.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/reference/select_and_scatter.hpp"
#include "ngraph/runtime/tensor_view.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::SelectAndScatter)
{
auto select_and_scatter = static_cast<const ngraph::op::SelectAndScatter*>(node);
auto select_function = select_and_scatter->get_functions()[0];
auto scatter_function = select_and_scatter->get_functions()[1];
auto backend = runtime::Backend::create("CPU");
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& callees = external_function->get_callees();
// Note: We bypass the completely broken ngraph "backend" API here
auto element_type = node->get_output_element_type(0);
if (element_type != element::f32)
{
throw ngraph_error(
"CPU direct execution mode does not support non-float inputs, use compiled "
"mode instead");
}
auto arg0_shape = args[0].get_shape();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto arg1_shape = args[1].get_shape();
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& arg2_tensor = tensor_data[args[2].get_name()];
auto out_shape = out[0].get_shape();
auto& out_tensor = tensor_data[out[0].get_name()];
auto window_shape = select_and_scatter->get_window_shape();
auto window_movement_strides = select_and_scatter->get_window_movement_strides();
if (!callees.count(select_function->get_name()))
{
callees[select_function->get_name()] =
make_shared<CPU_ExternalFunction>(select_function);
}
if (!callees.count(scatter_function->get_name()))
{
callees[scatter_function->get_name()] =
make_shared<CPU_ExternalFunction>(scatter_function);
}
auto& select_external_function = callees[select_function->get_name()];
auto& scatter_external_function = callees[scatter_function->get_name()];
auto select = [&, backend](float x, float y) {
TensorViewPtrs inputs, outputs;
char output;
inputs.emplace_back(backend->create_tensor(element::f32, Shape{}, &x));
inputs.emplace_back(backend->create_tensor(element::f32, Shape{}, &y));
outputs.emplace_back(backend->create_tensor(element::f32, Shape{}, &output));
select_external_function->make_call_frame()->call(outputs, inputs);
return output;
};
auto scatter = [&, backend](float x, float y) {
TensorViewPtrs inputs, outputs;
float output;
inputs.emplace_back(backend->create_tensor(element::f32, Shape{}, &x));
inputs.emplace_back(backend->create_tensor(element::f32, Shape{}, &y));
outputs.emplace_back(backend->create_tensor(element::f32, Shape{}, &output));
scatter_external_function->make_call_frame()->call(outputs, inputs);
return output;
};
auto functor = [&,
backend,
select,
scatter,
arg0_shape,
arg1_shape,
out_shape,
window_shape,
window_movement_strides](CPURuntimeContext* ctx) {
reference::select_and_scatter<float>(static_cast<float*>(arg0_tensor),
static_cast<float*>(arg1_tensor),
static_cast<float*>(arg2_tensor),
static_cast<float*>(out_tensor),
arg0_shape,
arg1_shape,
out_shape,
select,
scatter,
window_shape,
window_movement_strides);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(SelectAndScatter);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/sigmoid.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Sigmoid)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto input_shape = args[0].get_shape();
auto out_shape = out[0].get_shape();
auto input_size = static_cast<int>(shape_size(input_shape));
auto out_size = static_cast<int>(shape_size(out_shape));
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn::memory::desc(
{input_size},
mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
mkldnn::memory::format::x);
auto out_desc = mkldnn::memory::desc(
{out_size},
mkldnn_utils::get_mkldnn_data_type(out[0].get_element_type()),
mkldnn::memory::format::x);
auto sigmoid_index = mkldnn_emitter->build_sigmoid_forward(input_desc, out_desc);
auto& deps = mkldnn_emitter->get_primitive_deps(sigmoid_index);
auto functor = [&, sigmoid_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, sigmoid_index);
};
functors.emplace_back(functor);
}
template <>
void Builder::BUILDER_DECL(ngraph::op::SigmoidBackprop)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto input_shape = args[0].get_shape();
auto delta_shape = args[1].get_shape();
auto out_shape = out[0].get_shape();
int input_size = static_cast<int>(shape_size(input_shape));
int delta_size = static_cast<int>(shape_size(delta_shape));
int out_size = static_cast<int>(shape_size(out_shape));
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn::memory::desc(
{input_size},
mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
mkldnn::memory::format::x);
auto delta_desc = mkldnn::memory::desc(
{delta_size},
mkldnn_utils::get_mkldnn_data_type(args[1].get_element_type()),
mkldnn::memory::format::x);
auto out_desc = mkldnn::memory::desc(
{out_size},
mkldnn_utils::get_mkldnn_data_type(out[0].get_element_type()),
mkldnn::memory::format::x);
size_t sigmoid_index =
mkldnn_emitter->build_sigmoid_backward(input_desc, delta_desc, out_desc);
auto& deps = mkldnn_emitter->get_primitive_deps(sigmoid_index);
auto functor = [&, sigmoid_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, sigmoid_index);
};
functors.emplace_back(functor);
}
REGISTER_OP_BUILDER(Sigmoid);
REGISTER_OP_BUILDER(SigmoidBackprop);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <cstring>
#include "ngraph/op/sum.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/reduce_sum.hpp"
#include "reduction.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Sum)
{
BUILD_REDUCTION_FUNCTOR(Sum, sum);
}
REGISTER_OP_BUILDER(Sum);
}
}
}
@@ -157,6 +157,49 @@
SELECT_RANK(KV, uint64_t, R, K); \
}
#define BUILD_UNARY_ELEMWISE_FUNCTOR(OP) \
auto& functors = external_function->get_functors(); \
auto& tensor_data = external_function->get_tensor_data(); \
std::function<void(void*, void*, size_t)> kernel; \
\
SELECT_KERNEL(kernel, args[0].get_element_type(), OP); \
\
auto element_count = out[0].get_size(); \
auto& arg0_tensor = tensor_data[args[0].get_name()]; \
auto& out0_tensor = tensor_data[out[0].get_name()]; \
\
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) { \
kernel(arg0_tensor, out0_tensor, element_count); \
}; \
functors.emplace_back(functor);
#define BUILD_BINARY_ELEMWISE_FUNCTOR(OP) \
auto& functors = external_function->get_functors(); \
auto& tensor_data = external_function->get_tensor_data(); \
std::function<void(void*, void*, void*, size_t)> kernel; \
\
SELECT_KERNEL(kernel, args[0].get_element_type(), OP); \
\
auto element_count = out[0].get_size(); \
auto& arg0_tensor = tensor_data[args[0].get_name()]; \
auto& arg1_tensor = tensor_data[args[1].get_name()]; \
auto& out0_tensor = tensor_data[out[0].get_name()]; \
\
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) { \
kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count); \
}; \
functors.emplace_back(functor);
#define REGISTER_OP_BUILDER(OP) \
static struct __register_##OP##_builder \
{ \
__register_##OP##_builder() \
{ \
build_dispatcher.insert({type_index(typeid(ngraph::op::OP)), \
&runtime::cpu::Builder::build<ngraph::op::OP>}); \
} \
} __register_##OP##_builder_instance;
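REGISTER_OP_BUILDER works by static-initialization side effect: the file-scope instance's constructor inserts the builder into build_dispatcher before main() runs, which is why every builder/*.cpp above ends with one of these invocations, and why the hunk below drops the const qualifier from build_dispatcher. A self-contained sketch of the idiom (all names below are illustrative, not nGraph's):

#include <cassert>
#include <functional>
#include <typeindex>
#include <unordered_map>

// Illustrative stand-ins for ngraph's op types and build dispatcher.
struct OpA
{
};
using BuildFn = std::function<void()>;
std::unordered_map<std::type_index, BuildFn> dispatcher;

// Same shape as REGISTER_OP_BUILDER: a file-scope object whose constructor
// runs during static initialization and populates the dispatcher.
#define REGISTER_BUILDER(OP, FN)                                               \
    static struct register_##OP##_builder                                      \
    {                                                                          \
        register_##OP##_builder() { dispatcher.insert({typeid(OP), FN}); }     \
    } register_##OP##_builder_instance;

REGISTER_BUILDER(OpA, [] {})

int main()
{
    // The entry is present before any code in main() runs.
    assert(dispatcher.count(typeid(OpA)) == 1);
    return 0;
}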
namespace ngraph
{
namespace runtime
@@ -171,7 +214,7 @@ namespace ngraph
using BuildOpMap = std::unordered_map<std::type_index, BuildOpFunction>;
- extern const BuildOpMap build_dispatcher;
+ extern BuildOpMap build_dispatcher;
class Builder
{
......
@@ -1043,7 +1043,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment, true);
- pass_manager.run_passes(m_function);
+ pass_manager.run_passes(m_function, false);
// Store layouts assigned for arguments
for (const auto& parameter : m_function->get_parameters())
@@ -1166,6 +1166,8 @@ void runtime::cpu::CPU_ExternalFunction::build()
out_names.push_back(tv->get_tensor().get_name());
}
m_op_attrs.emplace_back(node->description(), out_names, in_names);
size_t functor_count = functors.size();
handler->second(this, node.get(), in, out);
@@ -1191,6 +1193,9 @@ void runtime::cpu::CPU_ExternalFunction::build()
executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) {
static bool first_iteration = true;
cpu::Timestamp start_ts;
int profiler_count = 0;
for (auto& p : intermediates_offsets)
{
tensor_data[p.first] =
@@ -1215,16 +1220,40 @@
{
for (size_t j = 0; j < p.second; j++)
{
if (runtime::cpu::IsTracingEnabled())
{
start_ts = cpu::Clock::now();
}
(*functor)(ctx);
if (runtime::cpu::IsTracingEnabled())
{
ctx->op_durations[profiler_count++] =
(std::chrono::duration_cast<cpu::Timescale>(cpu::Clock::now() -
start_ts))
.count();
}
std::advance(functor, 1);
}
}
else
{
if (runtime::cpu::IsTracingEnabled())
{
for (size_t j = 0; j < p.second; j++)
{
ctx->op_durations[profiler_count++] = 0;
}
}
std::advance(functor, p.second);
}
}
first_iteration = false;
if (runtime::cpu::IsTracingEnabled())
{
assert(m_op_attrs.size() == profiler_count);
}
};
m_is_built = true;
......
@@ -105,6 +105,11 @@ namespace ngraph
{
return executor;
}
std::unordered_map<std::string, std::shared_ptr<CPU_ExternalFunction>>&
get_callees()
{
return callees;
}
bool is_direct_execution() const { return m_direct_execution; }
protected:
void build();
@@ -167,6 +172,7 @@ namespace ngraph
std::unordered_map<std::string, bool> tensor_stale;
std::unordered_map<std::string, size_t> intermediates_offsets;
std::unordered_map<std::string, size_t> function_input_index, function_output_index;
std::unordered_map<std::string, std::shared_ptr<CPU_ExternalFunction>> callees;
bool m_is_built;
bool m_direct_execution;
};
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
void logical_and(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> in0(
static_cast<char*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> in1(
static_cast<char*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 && in1).template cast<char>();
}
}
}
}
}
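logical_and is a single Eigen elementwise expression evaluated on the global thread-pool device. The same expression on Eigen's default single-threaded evaluator, as a standalone check:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cstdio>

// Elementwise AND over char buffers, mirroring logical_and above but
// without the nGraph thread-pool device.
int main()
{
    char a[4] = {0, 1, 1, 0};
    char b[4] = {1, 1, 0, 0};
    char r[4];
    Eigen::array<Eigen::Index, 1> dims{{4}};
    Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> in0(a, dims),
        in1(b, dims), out(r, dims);
    out = (in0 && in1).cast<char>();
    for (char c : r)
        std::printf("%d ", c); // 0 1 0 0
    return 0;
}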
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/runtime/reference/batch_norm.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void batch_norm_three_outputs(double eps,
const void* arg0,
const void* arg1,
const void* arg2,
void* out0,
void* out1,
void* out2,
const Shape& arg2_shape)
{
reference::batch_norm_three_outputs(eps,
static_cast<const ElementType*>(arg0),
static_cast<const ElementType*>(arg1),
static_cast<const ElementType*>(arg2),
static_cast<ElementType*>(out0),
static_cast<ElementType*>(out1),
static_cast<ElementType*>(out2),
arg2_shape);
}
template <typename ElementType>
void batch_norm_one_output(double eps,
const void* arg0,
const void* arg1,
const void* arg2,
const void* arg3,
const void* arg4,
void* out0,
const Shape& arg2_shape)
{
reference::batch_norm_one_output(eps,
static_cast<const ElementType*>(arg0),
static_cast<const ElementType*>(arg1),
static_cast<const ElementType*>(arg2),
static_cast<const ElementType*>(arg3),
static_cast<const ElementType*>(arg4),
static_cast<ElementType*>(out0),
arg2_shape);
}
}
}
}
}
@@ -19,8 +19,10 @@
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/axis_set.hpp"
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/runtime/reference/broadcast.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
@@ -30,18 +32,33 @@ namespace ngraph
{
namespace kernel
{
- template <typename ElementType>
- void broadcast(void* input0,
+ template <typename ElementType, unsigned int Rank>
+ void broadcast(void* input,
void* output,
- const Shape& arg0_shape,
- const Shape& result_shape,
- const AxisSet& broadcast_axes)
+ const Shape& input_shape,
+ const Shape& output_shape)
{
- reference::broadcast<ElementType>(static_cast<const ElementType*>(input0),
- static_cast<ElementType*>(output),
- arg0_shape,
- result_shape,
- broadcast_axes);
+ Eigen::array<Eigen::Index, Rank> out_dims;
+ Eigen::array<Eigen::Index, Rank> in_dims;
+ for (int i = 0; i < Rank; i++)
+ {
+ out_dims[i] = output_shape[i];
+ in_dims[i] = input_shape[i];
+ }
+ Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> out(
+ static_cast<ElementType*>(output), out_dims);
+ Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
+ static_cast<ElementType*>(input), in_dims);
+ Eigen::array<ptrdiff_t, Rank> factors;
+ for (int i = 0; i < Rank; i++)
+ {
+ factors[i] = output_shape[i] / input_shape[i];
+ }
+ out.device(eigen::global_thread_pool_device) = in.broadcast(factors);
}
}
}
......
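Eigen's broadcast() takes per-axis repetition factors rather than a target shape, hence the integer division above and the reshape to new_shape (extent 1 on broadcast axes) in the builder. A standalone sketch:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cstdio>

// A {1,3} input with repetition factors {2,1} yields a {2,3} output, which
// is why the builder first pads the argument shape with 1s.
int main()
{
    float in_data[3] = {1.0f, 2.0f, 3.0f};
    Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> in(in_data, 1, 3);
    Eigen::array<ptrdiff_t, 2> factors{{2, 1}};
    Eigen::Tensor<float, 2, Eigen::RowMajor> out = in.broadcast(factors);
    std::printf("%g %g\n", out(0, 2), out(1, 2)); // 3 3
    return 0;
}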
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <functional>
#include <iostream>
#include <vector>
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType, int Rank>
void concat(std::vector<std::reference_wrapper<void*>> inputs,
std::vector<Shape> input_shapes,
void* output,
Shape output_shape,
size_t axis)
{
Eigen::array<Eigen::Index, Rank> out_dims;
for (int i = 0; i < Rank; i++)
{
out_dims[i] = output_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::array<Eigen::Index, Rank> in_dims, concat_pos;
concat_pos.fill(0);
for (int i = 0; i < input_shapes.size(); i++)
{
for (int j = 0; j < Rank; j++)
{
in_dims[j] = input_shapes[i][j];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
static_cast<ElementType*>(inputs[i].get()), in_dims);
out.slice(concat_pos, in_dims).device(eigen::global_thread_pool_device) =
in;
concat_pos[axis] += in_dims[axis];
}
}
}
}
}
}
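The kernel writes each input into a slice of the output and advances the slice offset along the concatenation axis. The same slice-and-advance pattern, standalone:

#include <unsupported/Eigen/CXX11/Tensor>
#include <cstdio>

// Concatenate two {1,2} tensors along axis 0 by assigning each into a slice
// of the {2,2} output and bumping the offset on the concat axis.
int main()
{
    float a[2] = {1, 2}, b[2] = {3, 4}, r[4];
    Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> in0(a, 1, 2),
        in1(b, 1, 2), out(r, 2, 2);
    Eigen::array<Eigen::Index, 2> pos{{0, 0}}, extents{{1, 2}};
    out.slice(pos, extents) = in0;
    pos[0] += 1;
    out.slice(pos, extents) = in1;
    std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 1 2 3 4
    return 0;
}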
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename InputElementType, typename OutputElementType>
void convert(void* input, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<OutputElementType, 1, Eigen::RowMajor>> out(
static_cast<OutputElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<InputElementType, 1, Eigen::RowMajor>> in(
static_cast<InputElementType*>(input), in_dims);
out.device(eigen::global_thread_pool_device) =
in.template cast<OutputElementType>();
}
template <typename InputElementType>
void convert_to_float32(void* input, void* output, size_t count)
{
convert<InputElementType, float>(input, output, count);
}
template <typename InputElementType>
void convert_to_float64(void* input, void* output, size_t count)
{
convert<InputElementType, double>(input, output, count);
}
template <typename InputElementType>
void convert_to_i8(void* input, void* output, size_t count)
{
convert<InputElementType, int8_t>(input, output, count);
}
template <typename InputElementType>
void convert_to_i16(void* input, void* output, size_t count)
{
convert<InputElementType, int16_t>(input, output, count);
}
template <typename InputElementType>
void convert_to_i32(void* input, void* output, size_t count)
{
convert<InputElementType, int32_t>(input, output, count);
}
template <typename InputElementType>
void convert_to_i64(void* input, void* output, size_t count)
{
convert<InputElementType, int64_t>(input, output, count);
}
template <typename InputElementType>
void convert_to_u8(void* input, void* output, size_t count)
{
convert<InputElementType, uint8_t>(input, output, count);
}
template <typename InputElementType>
void convert_to_u16(void* input, void* output, size_t count)
{
convert<InputElementType, uint16_t>(input, output, count);
}
template <typename InputElementType>
void convert_to_u32(void* input, void* output, size_t count)
{
convert<InputElementType, uint32_t>(input, output, count);
}
template <typename InputElementType>
void convert_to_u64(void* input, void* output, size_t count)
{
convert<InputElementType, uint64_t>(input, output, count);
}
}
}
}
}
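Each convert_to_* wrapper pins the output type so SELECT_KERNEL only has to dispatch over the input element type. The underlying cast expression, standalone (float to int32_t, which truncates toward zero):

#include <unsupported/Eigen/CXX11/Tensor>
#include <cstdint>
#include <cstdio>

// Elementwise cast, mirroring convert<float, int32_t> above.
int main()
{
    float in_data[3] = {1.9f, -2.5f, 3.0f};
    int32_t out_data[3];
    Eigen::array<Eigen::Index, 1> dims{{3}};
    Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor>> in(in_data, dims);
    Eigen::TensorMap<Eigen::Tensor<int32_t, 1, Eigen::RowMajor>> out(out_data, dims);
    out = in.cast<int32_t>();
    std::printf("%d %d %d\n", out_data[0], out_data[1], out_data[2]); // 1 -2 3
    return 0;
}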
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void cwise_pow(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) = in0.binaryExpr(
in1, Eigen::internal::scalar_pow_op<ElementType, ElementType>());
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void divide(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) = in0 / in1;
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/runtime/reference/dot.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
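// Dot as an Eigen contraction: the last DotDims axes of input0 are
// contracted against the first DotDims axes of input1, which matches
// ngraph Dot semantics (see the dot_dims setup below).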
template <typename ElementType,
unsigned int Input0Rank,
unsigned int Input1Rank,
unsigned int DotDims>
void dot(void* input0,
void* input1,
void* output,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
constexpr unsigned int OutRank = Input0Rank + Input1Rank - 2 * DotDims;
Eigen::array<Eigen::Index, OutRank> out_dims;
Eigen::array<Eigen::Index, Input0Rank> in0_dims;
Eigen::array<Eigen::Index, Input1Rank> in1_dims;
Eigen::array<Eigen::IndexPair<Eigen::Index>, DotDims> dot_dims;
for (int i = 0; i < OutRank; i++)
{
out_dims[i] = output_shape[i];
}
for (int i = 0; i < Input0Rank; i++)
{
in0_dims[i] = input0_shape[i];
}
for (int i = 0; i < Input1Rank; i++)
{
in1_dims[i] = input1_shape[i];
}
for (int i = 0; i < DotDims; i++)
{
dot_dims[i].first = Input0Rank - DotDims + i;
dot_dims[i].second = i;
}
Eigen::TensorMap<Eigen::Tensor<ElementType, OutRank, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Input0Rank, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in0_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Input1Rank, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in1_dims);
out.device(eigen::global_thread_pool_device) = in0.contract(in1, dot_dims);
}
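// Scalar-by-tensor product: input0 holds a single element that is
// broadcast across input1.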
template <typename ElementType>
void dot_scalar(void* input0, void* input1, void* output, size_t element_count)
{
Eigen::array<Eigen::Index, 1> out_dims;
Eigen::array<Eigen::Index, 1> in1_dims;
out_dims[0] = element_count;
in1_dims[0] = element_count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
auto in0 = static_cast<ElementType*>(input0);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in1_dims);
out.device(eigen::global_thread_pool_device) = in0[0] * in1;
}
template <typename ElementType>
void dot_1d_1d_1rd(void* input0,
void* input1,
void* output,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
dot<ElementType, 1, 1, 1>(
input0, input1, output, input0_shape, input1_shape, output_shape);
}
template <typename ElementType>
void dot_2d_1d_1rd(void* input0,
void* input1,
void* output,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
dot<ElementType, 2, 1, 1>(
input0, input1, output, input0_shape, input1_shape, output_shape);
}
template <typename ElementType>
void dot_3d_3d_1rd(void* input0,
void* input1,
void* output,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
dot<ElementType, 3, 3, 1>(
input0, input1, output, input0_shape, input1_shape, output_shape);
}
template <typename ElementType>
void dot_3d_2d_1rd(void* input0,
void* input1,
void* output,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
dot<ElementType, 3, 2, 1>(
input0, input1, output, input0_shape, input1_shape, output_shape);
}
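// Fallback for rank/axis combinations without a specialized Eigen
// contraction above: delegate to the reference implementation.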
template <typename ElementType>
void dot(void* arg0,
void* arg1,
void* out,
const Shape& arg0_shape,
const Shape& arg1_shape,
const Shape& out_shape,
size_t reduction_axes_count)
{
reference::dot(static_cast<const ElementType*>(arg0),
static_cast<const ElementType*>(arg1),
static_cast<ElementType*>(out),
arg0_shape,
arg1_shape,
out_shape,
reduction_axes_count);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
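// Comparison kernels write one byte per element (char), matching the
// storage layout of ngraph's boolean tensors.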
template <typename ElementType>
void equal(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 == in1).template cast<char>();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void exp(void* input0, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
out.device(eigen::global_thread_pool_device) = in0.exp();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void floor(void* input0, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
out.device(eigen::global_thread_pool_device) = in0.floor();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void greater(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 > in1).template cast<char>();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void greater_eq(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 >= in1).template cast<char>();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void less(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 < in1).template cast<char>();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void less_eq(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 <= in1).template cast<char>();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void log(void* input0, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
out.device(eigen::global_thread_pool_device) = in0.log();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/runtime/reference/max_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
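// Thin wrapper that forwards to the reference max-pool implementation;
// no Eigen kernel is used here.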
template <typename ElementType>
void max_pool(void* arg,
void* out,
const Shape& arg_shape,
const Shape& out_shape,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above)
{
reference::max_pool<ElementType>(static_cast<const ElementType*>(arg),
static_cast<ElementType*>(out),
arg_shape,
out_shape,
window_shape,
window_movement_strides,
padding_below,
padding_above);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void maximum(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) = in0.cwiseMax(in1);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void minimum(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) = in0.cwiseMin(in1);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void negative(void* input0, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
out.device(eigen::global_thread_pool_device) = -in0;
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
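// Logical NOT implemented as an equality test against zero, then cast
// back to the char-based boolean layout.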
template <typename ElementType>
void logical_not(void* input0, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 == ElementType(0)).template cast<char>();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void not_equal(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 != in1).template cast<char>();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
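// Both inputs are already char-based boolean tensors, so no element-type
// template parameter is needed.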
inline void logical_or(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> out(
static_cast<char*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> in0(
static_cast<char*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> in1(
static_cast<char*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) =
(in0 || in1).template cast<char>();
}
}
}
}
}
......@@ -20,6 +20,7 @@
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/runtime/reference/max.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
......@@ -31,8 +32,8 @@ namespace ngraph
namespace kernel
{
template <typename ElementType, unsigned int Rank>
void reduce_max_all(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape)
{
......@@ -44,16 +45,16 @@ namespace ngraph
in_dims[i] = input_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, 0, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
static_cast<ElementType*>(input), in_dims);
out.device(eigen::global_thread_pool_device) = in.maximum();
}
template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
void reduce_max(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
......@@ -80,11 +81,69 @@ namespace ngraph
Eigen::TensorMap<
Eigen::Tensor<ElementType, Rank - ReductionDims, Eigen::RowMajor>>
out(static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
static_cast<ElementType*>(input), in_dims);
out.device(eigen::global_thread_pool_device) = in.maximum(reduction_dims);
}
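// Fixed-arity wrappers: presumably selected by the op builder based on
// input rank and the number of reduction axes, so each common
// (rank, reduction-dims) combination gets a concrete entry point.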
template <typename ElementType, unsigned int Rank>
void reduce_max_1rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_max<ElementType, Rank, 1>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_max_3d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_max<ElementType, 3, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_max_4d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_max<ElementType, 4, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_max_5d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_max<ElementType, 5, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void max(void* arg,
void* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
reference::max(static_cast<ElementType*>(arg),
static_cast<ElementType*>(out),
in_shape,
out_shape,
reduction_axes);
}
}
}
}
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/runtime/reference/min.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType, unsigned int Rank>
void reduce_min_all(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape)
{
Eigen::array<Eigen::Index, Rank> in_dims;
Eigen::array<Eigen::Index, 0> out_dims;
for (int i = 0; i < Rank; i++)
{
in_dims[i] = input_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, 0, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
static_cast<ElementType*>(input), in_dims);
out.device(eigen::global_thread_pool_device) = in.minimum();
}
template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
void reduce_min(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
Eigen::array<Eigen::Index, Rank> in_dims;
Eigen::array<Eigen::Index, Rank - ReductionDims> out_dims;
Eigen::array<Eigen::Index, ReductionDims> reduction_dims;
for (int i = 0; i < Rank; i++)
{
in_dims[i] = input_shape[i];
}
for (int i = 0; i < Rank - ReductionDims; i++)
{
out_dims[i] = output_shape[i];
}
int i = 0;
for (auto axis : reduction_axes)
{
reduction_dims[i++] = axis;
}
Eigen::TensorMap<
Eigen::Tensor<ElementType, Rank - ReductionDims, Eigen::RowMajor>>
out(static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
static_cast<ElementType*>(input), in_dims);
out.device(eigen::global_thread_pool_device) = in.minimum(reduction_dims);
}
template <typename ElementType, unsigned int Rank>
void reduce_min_1rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_min<ElementType, Rank, 1>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_min_3d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_min<ElementType, 3, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_min_4d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_min<ElementType, 4, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_min_5d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_min<ElementType, 5, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void min(void* arg,
void* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
reference::min(static_cast<ElementType*>(arg),
static_cast<ElementType*>(out),
in_shape,
out_shape,
reduction_axes);
}
}
}
}
}
......@@ -20,6 +20,7 @@
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/runtime/reference/sum.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
......@@ -31,8 +32,8 @@ namespace ngraph
namespace kernel
{
template <typename ElementType, unsigned int Rank>
void reduce_sum_all(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape)
{
......@@ -44,16 +45,16 @@ namespace ngraph
in_dims[i] = input_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, 0, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
static_cast<ElementType*>(input), in_dims);
out.device(eigen::global_thread_pool_device) = in.sum();
}
template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
void reduce_sum(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
......@@ -80,11 +81,69 @@ namespace ngraph
Eigen::TensorMap<
Eigen::Tensor<ElementType, Rank - ReductionDims, Eigen::RowMajor>>
out(static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
static_cast<ElementType*>(input), in_dims);
out.device(eigen::global_thread_pool_device) = in.sum(reduction_dims);
}
template <typename ElementType, unsigned int Rank>
void reduce_sum_1rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_sum<ElementType, Rank, 1>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_sum_3d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_sum<ElementType, 3, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_sum_4d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_sum<ElementType, 4, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void reduce_sum_5d_2rd(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
reduce_sum<ElementType, 5, 2>(
input, output, input_shape, output_shape, reduction_axes);
}
template <typename ElementType>
void sum(void* arg,
void* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
reference::sum(static_cast<ElementType*>(arg),
static_cast<ElementType*>(out),
in_shape,
out_shape,
reduction_axes);
}
}
}
}
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/runtime/reference/reverse.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void reverse(const void* arg,
void* out,
const Shape& arg_shape,
const Shape& out_shape,
const AxisSet& reversed_axes)
{
reference::reverse(static_cast<const ElementType*>(arg),
static_cast<ElementType*>(out),
arg_shape,
out_shape,
reversed_axes);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cstdint>
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename InputElementType, typename SeqLenType, unsigned int Rank>
void reverse_sequence(void* input,
void* output,
const Shape& input_shape,
size_t batch_axis,
size_t sequence_axis,
void* sequence_lengths)
{
Eigen::array<Eigen::Index, Rank> in_dims;
for (int i = 0; i < Rank; i++)
{
in_dims[i] = input_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<InputElementType, Rank, Eigen::RowMajor>> out(
static_cast<InputElementType*>(output), in_dims);
Eigen::TensorMap<Eigen::Tensor<InputElementType, Rank, Eigen::RowMajor>> in(
static_cast<InputElementType*>(input), in_dims);
auto slv = static_cast<SeqLenType*>(sequence_lengths);
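// Generator: for indices inside the recorded sequence length, mirror the
// position along the sequence axis; indices past the length are copied
// through unchanged.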
auto generator = [&](const Eigen::array<Eigen::DenseIndex, Rank>& i) {
Eigen::array<Eigen::DenseIndex, Rank> k = i;
if (i[sequence_axis] < slv[i[batch_axis]])
{
k[sequence_axis] = slv[i[batch_axis]] - i[sequence_axis] - 1;
}
return in(k);
};
out.device(eigen::global_thread_pool_device) = in.generate(generator);
}
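// Convenience instantiation for int32 sequence-length tensors.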
template <typename InputElementType, unsigned int Rank>
void reverse_sequence_sli32(void* input,
void* output,
const Shape& input_shape,
size_t batch_axis,
size_t sequence_axis,
void* sequence_lengths)
{
reverse_sequence<InputElementType, int32_t, Rank>(
input, output, input_shape, batch_axis, sequence_axis, sequence_lengths);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
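// out[i] = in0[i] ? in1[i] : in2[i]; the selector in0 is a char-based
// boolean tensor.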
template <typename ElementType>
void select(void* input0, void* input1, void* input2, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<char, 1, Eigen::RowMajor>> in0(
static_cast<char*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in2(
static_cast<ElementType*>(input2), in_dims);
out.device(eigen::global_thread_pool_device) = in0.select(in1, in2);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void sign(void* input0, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
out.device(eigen::global_thread_pool_device) = in0.sign();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void sqrt(void* input, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in(
static_cast<ElementType*>(input), in_dims);
out.device(eigen::global_thread_pool_device) = in.sqrt();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void subtract(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) = in0 - in1;
}
}
}
}
}