Commit abb68627 authored by Jaikrishnan Menon, committed by Scott Cyphers

Jmenon/dexec (#1092)

* CPU: Direct Execution
Part 1 with bare minimum infrastructure

* Refactor: Move build related functionality to a separate TU
and external function method

* Add TU back after merge

* Remove an assert

* Remove commented-out code
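* Usage note: direct execution is opt-in. The CPU_ExternalFunction constructor below reads the NGRAPH_DEX environment variable once, so the flag must be set before the external function is created. A minimal sketch (illustrative only; setenv is POSIX, and the surrounding ngraph session setup is elided):

#include <cstdlib>
// Select direct execution (DEX) for CPU external functions created after
// this point; the flag is read in CPU_ExternalFunction's constructor.
setenv("NGRAPH_DEX", "1", /*overwrite=*/1);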
parent 79dd92d3
......@@ -16,6 +16,7 @@
set(SRC
cpu_backend.cpp
cpu_builder.cpp
cpu_call_frame.cpp
cpu_emitter.cpp
cpu_external_function.cpp
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include <algorithm>
#include <cmath>
#include <cstring>
#include <functional>
#include <numeric>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "ngraph/node.hpp"
#include "ngraph/op/abs.hpp"
#include "ngraph/op/acos.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/allreduce.hpp"
#include "ngraph/op/and.hpp"
#include "ngraph/op/asin.hpp"
#include "ngraph/op/atan.hpp"
#include "ngraph/op/avg_pool.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/ceiling.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/convert.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/cos.hpp"
#include "ngraph/op/cosh.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/function_call.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/greater_eq.hpp"
#include "ngraph/op/less.hpp"
#include "ngraph/op/less_eq.hpp"
#include "ngraph/op/log.hpp"
#include "ngraph/op/max.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/min.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/not.hpp"
#include "ngraph/op/not_equal.hpp"
#include "ngraph/op/one_hot.hpp"
#include "ngraph/op/op.hpp"
#include "ngraph/op/or.hpp"
#include "ngraph/op/pad.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/op/power.hpp"
#include "ngraph/op/product.hpp"
#include "ngraph/op/reduce.hpp"
#include "ngraph/op/reduce_window.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/remainder.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/reverse_sequence.hpp"
#include "ngraph/op/select.hpp"
#include "ngraph/op/select_and_scatter.hpp"
#include "ngraph/op/sign.hpp"
#include "ngraph/op/sin.hpp"
#include "ngraph/op/sinh.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/softmax.hpp"
#include "ngraph/op/sqrt.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/op/sum.hpp"
#include "ngraph/op/tan.hpp"
#include "ngraph/op/tanh.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/kernel/abs.hpp"
#include "ngraph/runtime/cpu/kernel/add.hpp"
#include "ngraph/runtime/cpu/kernel/multiply.hpp"
#include "ngraph/runtime/cpu/kernel/result.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
#include "ngraph/type/element_type.hpp"
#include "ngraph/util.hpp"
#ifdef NGRAPH_DISTRIBUTED
#include <mpi.h>
#include "ngraph/op/allreduce.hpp"
#endif
using namespace std;
using namespace ngraph;
// Per-type kernel macro
#define SELECT_KERNEL(KV, ET, K) \
if (ET == element::boolean) \
{ \
KV = K<char>; \
} \
else if (ET == element::f32) \
{ \
KV = K<float>; \
} \
else if (ET == element::f64) \
{ \
KV = K<double>; \
} \
else if (ET == element::i8) \
{ \
KV = K<int8_t>; \
} \
else if (ET == element::i16) \
{ \
KV = K<int16_t>; \
} \
else if (ET == element::i32) \
{ \
KV = K<int32_t>; \
} \
else if (ET == element::i64) \
{ \
KV = K<int64_t>; \
} \
else if (ET == element::u8) \
{ \
KV = K<uint8_t>; \
} \
else if (ET == element::u16) \
{ \
KV = K<uint16_t>; \
} \
else if (ET == element::u32) \
{ \
KV = K<uint32_t>; \
} \
else if (ET == element::u64) \
{ \
KV = K<uint64_t>; \
}
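For example, with a float32 output, SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::add) collapses to a single assignment of one template instantiation:

kernel = runtime::cpu::kernel::add<float>;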
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Add)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
std::function<void(void*, void*, void*, size_t)> kernel;
SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::add);
auto element_count = out[0].get_size();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& out0_tensor = tensor_data[out[0].get_name()];
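// Note: arg0_tensor, arg1_tensor and out0_tensor are references into
// tensor_data and are captured by reference below; the executor rewrites the
// underlying tensor_data entries before each call, so the functor always
// dereferences up-to-date addresses.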
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count);
};
functors.emplace_back(functor);
}
template <>
void Builder::BUILDER_DECL(ngraph::op::Multiply)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
std::function<void(void*, void*, void*, size_t)> kernel;
SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::multiply);
auto element_count = out[0].get_size();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& out0_tensor = tensor_data[out[0].get_name()];
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count);
};
functors.emplace_back(functor);
}
template <>
void Builder::BUILDER_DECL(ngraph::op::Abs)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
std::function<void(void*, void*, size_t)> kernel;
SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::abs);
auto element_count = out[0].get_size();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& out0_tensor = tensor_data[out[0].get_name()];
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
kernel(arg0_tensor, out0_tensor, element_count);
};
functors.emplace_back(functor);
}
template <>
void Builder::BUILDER_DECL(ngraph::op::Result)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
std::function<void(void*, void*, size_t)> kernel;
SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::result);
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& out0_tensor = tensor_data[out[0].get_name()];
auto size = shape_size(node->get_shape());
auto functor = [&, kernel, size](CPURuntimeContext* ctx) {
kernel(arg0_tensor, out0_tensor, size);
};
functors.emplace_back(functor);
}
template <>
void Builder::BUILDER_DECL(ngraph::op::Constant)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
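// A Constant requires run-time work only when it is itself a function result;
// then its data must be memcpy'd into each aliased output tensor. For all
// other uses, build() points tensor_data directly at the constant's storage.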
vector<void**> dest;
for (auto& result : external_function->get_function()->get_results())
{
if (result.get() == node)
{
dest.push_back(&tensor_data[result->get_output_tensor(0).get_name()]);
}
}
auto& src = tensor_data[node->get_output_tensor(0).get_name()];
auto size = node->get_output_tensor(0).size();
auto functor = [&, dest, src, size](CPURuntimeContext* ctx) {
for (auto p : dest)
{
memcpy(*p, src, size);
}
};
functors.emplace_back(functor);
}
#define TI(x) type_index(typeid(x))
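// Ops without an entry here fall through to the generic Builder::build
// template (declared in cpu_builder.hpp), which throws "Unimplemented op in
// CPU builder"; Parameter needs no run-time work and maps to Builder::nop.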
const BuildOpMap build_dispatcher{
{TI(ngraph::op::Add), &runtime::cpu::Builder::build<ngraph::op::Add>},
{TI(ngraph::op::Multiply), &runtime::cpu::Builder::build<ngraph::op::Multiply>},
{TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
{TI(ngraph::op::Abs), &runtime::cpu::Builder::build<ngraph::op::Abs>},
{TI(ngraph::op::Result), &runtime::cpu::Builder::build<ngraph::op::Result>},
{TI(ngraph::op::Constant), &runtime::cpu::Builder::build<ngraph::op::Constant>}};
}
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <functional>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "ngraph/node.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#define BUILDER_DECL(op_name) \
build<op_name>(CPU_ExternalFunction * external_function, \
const ngraph::Node* node, \
const std::vector<TensorViewWrapper>& args, \
const std::vector<TensorViewWrapper>& out)
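For reference, a specialization written as template <> void Builder::BUILDER_DECL(ngraph::op::Add) in cpu_builder.cpp expands to:

template <>
void Builder::build<ngraph::op::Add>(CPU_ExternalFunction* external_function,
const ngraph::Node* node,
const std::vector<TensorViewWrapper>& args,
const std::vector<TensorViewWrapper>& out)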
namespace ngraph
{
namespace runtime
{
namespace cpu
{
using BuildOpFunction =
std::function<void(CPU_ExternalFunction* external_function,
const ngraph::Node*,
const std::vector<TensorViewWrapper>& inputs,
const std::vector<TensorViewWrapper>& outputs)>;
using BuildOpMap = std::unordered_map<std::type_index, BuildOpFunction>;
extern const BuildOpMap build_dispatcher;
class Builder
{
public:
template <typename OP>
static void build(CPU_ExternalFunction* external_function,
const ngraph::Node* node,
const std::vector<TensorViewWrapper>& args,
const std::vector<TensorViewWrapper>& out)
{
throw std::runtime_error("Unimplemented op in CPU builder");
}
static void nop(CPU_ExternalFunction* external_function,
const ngraph::Node* node,
const std::vector<TensorViewWrapper>& args,
const std::vector<TensorViewWrapper>& out)
{
}
static void buildBatchNorm(CPU_ExternalFunction* external_function,
const ngraph::Node* node,
const std::vector<TensorViewWrapper>& args,
const std::vector<TensorViewWrapper>& out,
bool append_relu = false);
};
}
}
}
......@@ -63,7 +63,14 @@ void runtime::cpu::CPU_CallFrame::call(
}
// Invoke compiled computation
if (!m_external_function->is_direct_execution())
{
m_compiled_function(inputs.data(), outputs.data(), ctx);
}
else
{
m_external_function->get_executor()(ctx, inputs, outputs);
}
if (runtime::cpu::IsTracingEnabled())
{
......
......@@ -89,7 +89,6 @@
#include "ngraph/op/sum.hpp"
#include "ngraph/op/tan.hpp"
#include "ngraph/op/tanh.hpp"
#include "ngraph/runtime/cpu/cpu_emitter.hpp"
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
......
......@@ -109,7 +109,9 @@
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/nop_elimination.hpp"
#include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_emitter.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
......@@ -314,6 +316,8 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
, m_emit_timing(false)
, m_use_tbb(std::getenv("NGRAPH_CPU_USE_TBB") != nullptr)
, m_function_name(function->get_name())
, m_is_built(false)
, m_direct_execution(std::getenv("NGRAPH_DEX") != nullptr)
{
}
......@@ -681,25 +685,6 @@ using namespace ngraph::runtime;
}
}
// create output alias map
/*
size_t output_index = 0;
unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
vector<size_t> aliases;
for (size_t i = 0; i < current_function->get_output_size(); ++i)
{
shared_ptr<Node> op = current_function->get_output_op(i);
shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
vector<size_t>& al = output_alias_map[otv.get()];
al.push_back(output_index);
if (al.size() > 1)
{
aliases.push_back(output_index);
}
output_index++;
}
*/
// Add outputs to the variable name map
for (size_t i = 0; i < current_function->get_output_size(); ++i)
{
......@@ -960,6 +945,34 @@ using namespace ngraph::runtime;
writer += "}\n\n";
}
// TODO: Cleanup and make this a utility function
file_util::make_directory(s_output_dir);
string filename = file_util::path_join(s_output_dir, m_function_name + "_codegen.cpp");
ofstream out(filename);
string code = writer.get_code();
out << code;
out.close();
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(pch_header_source);
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
{
throw runtime_error("function failed to compile");
}
m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(m_function_name);
if (m_compiled_function == nullptr)
{
throw runtime_error("could not find compiled function");
}
// Store layouts assigned for arguments
for (const auto& parameter : m_function->get_parameters())
{
......@@ -975,6 +988,7 @@ using namespace ngraph::runtime;
static_pointer_cast<runtime::cpu::LayoutDescriptor>(tv->get_tensor_view_layout()));
}
}
// Store layouts assigned for results
if (!result_layout_descriptors.empty())
{
......@@ -995,35 +1009,187 @@ using namespace ngraph::runtime;
}
}
m_is_compiled = true;
if (m_release_function)
{
release_function();
}
}
void runtime::cpu::CPU_ExternalFunction::build()
{
if (m_is_built)
{
return;
}
m_mkldnn_emitter.reset(new MKLDNNEmitter());
ngraph::pass::Manager pass_manager;
// nv_cwi is required only by some frontends,
// in which case they should run this pass (CPUWorkspaceInsertion) explicitly
NodeVector nv_cwi;
pass_manager.register_pass<ngraph::pass::NopElimination>();
pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::cpu::pass::RNNFusion>();
pass_manager.register_pass<runtime::cpu::pass::ConcatInputs>();
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
pass_manager.register_pass<ngraph::pass::CoreFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi);
pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
pass_manager.register_pass<runtime::cpu::pass::CPUShuffleFolding>();
pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment, true);
pass_manager.run_passes(m_function);
// Store layouts assigned for arguments
for (const auto& parameter : m_function->get_parameters())
{
for (size_t i = 0; i < parameter->get_output_size(); ++i)
{
auto tv = parameter->get_output_tensor_view(i);
if (tv->get_tensor_view_layout() == nullptr)
{
throw ngraph_error("layout missing on function parameter's tensor view: " +
tv->get_name());
}
parameter_layout_descriptors.emplace_back(
static_pointer_cast<runtime::cpu::LayoutDescriptor>(tv->get_tensor_view_layout()));
}
}
// Store layouts assigned for results
if (!result_layout_descriptors.empty())
{
throw ngraph_error("Function output layouts should not be pre-assigned");
}
for (size_t i = 0; i < m_function->get_output_size(); ++i)
{
const auto& output = m_function->get_output_op(i);
for (size_t j = 0; j < output->get_output_size(); ++j)
{
auto tv = output->get_output_tensor_view(j);
if (tv->get_tensor_view_layout() == nullptr)
{
throw ngraph_error("layout missing on function output tensor: " + tv->get_name());
}
result_layout_descriptors.emplace_back(
static_pointer_cast<runtime::cpu::LayoutDescriptor>(tv->get_tensor_view_layout()));
}
}
// Build executor
// Inputs
size_t arg_index = 0;
for (auto& param : m_function->get_parameters())
{
for (size_t i = 0; i < param->get_output_size(); ++i)
{
shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
function_input_index[tv->get_tensor().get_name()] = arg_index;
arg_index++;
}
}
// Outputs
for (size_t i = 0; i < m_function->get_output_size(); ++i)
{
shared_ptr<Node> op = m_function->get_output_op(i);
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
function_output_index[tv->get_tensor().get_name()] = i;
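// When a Result has been marked as not needing a copy (see the
// ResultCopyElimination pass above), alias the tensor feeding it to the same
// output index so the producer writes straight into the caller's buffer.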
auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op);
if (!res->needs_copy())
{
shared_ptr<descriptor::TensorView> itv =
res->get_inputs().at(0).get_output().get_tensor_view();
function_output_index[itv->get_tensor().get_name()] = i;
}
}
// Intermediates
if (m_function->get_temporary_pool_size())
{
m_memory_buffer_sizes.push_back(m_function->get_temporary_pool_size());
for (auto& node : m_function->get_ordered_ops())
{
for (auto tensor : node->liveness_new_list)
{
intermediates_offsets[tensor->get_name()] = tensor->get_pool_offset();
}
}
}
// Constants
for (auto& node : m_function->get_ordered_ops())
{
const auto c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c)
{
auto tv = node->get_outputs()[0].get_tensor_view();
tensor_data[tv->get_tensor().get_name()] = const_cast<void*>(c->get_data_ptr());
}
}
for (shared_ptr<Node> node : m_function->get_ordered_ops())
{
auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = build_dispatcher.find(type_index(typeid(n)));
if (handler == build_dispatcher.end())
{
throw ngraph_error("Unhandled op during code generation : " + node->description());
}
vector<TensorViewWrapper> in;
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(TensorViewWrapper(tv, tv->get_tensor().get_name()));
}
vector<TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(TensorViewWrapper(tv, tv->get_tensor().get_name()));
}
handler->second(this, node.get(), in, out);
}
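// The executor rebinds tensor_data on every invocation: intermediates are
// rebased onto this frame's memory pool, caller-provided input/output
// pointers are patched in, and then the pre-built functors run in graph order.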
executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) {
for (auto& p : intermediates_offsets)
{
tensor_data[p.first] =
static_cast<uint8_t*>(ctx->memory_buffers[0]->get_ptr()) + p.second;
}
for (const auto& p : function_input_index)
{
tensor_data[p.first] = inputs[p.second];
}
for (const auto& p : function_output_index)
{
tensor_data[p.first] = outputs[p.second];
}
for (const auto& functor : functors)
{
functor(ctx);
}
};
m_is_built = true;
if (m_release_function)
{
release_function();
......@@ -1033,11 +1199,16 @@ using namespace ngraph::runtime;
shared_ptr<ngraph::runtime::cpu::CPU_CallFrame>
runtime::cpu::CPU_ExternalFunction::make_call_frame()
{
if (!m_is_compiled)
if (!m_is_compiled && !m_direct_execution)
{
compile();
}
if (!m_is_built && m_direct_execution)
{
build();
}
return make_shared<ngraph::runtime::cpu::CPU_CallFrame>(shared_from_this(),
m_compiled_function);
}
......
......@@ -17,6 +17,7 @@
#pragma once
#include <functional>
#include <list>
#include <map>
#include <memory>
#include <string>
......@@ -94,7 +95,19 @@ namespace ngraph
// Temporary Memory Pool alignment
static const size_t s_memory_pool_alignment;
std::list<std::function<void(CPURuntimeContext*)>>& get_functors()
{
return functors;
}
std::unordered_map<std::string, void*>& get_tensor_data() { return tensor_data; }
std::function<void(CPURuntimeContext*, std::vector<void*>&, std::vector<void*>&)>&
get_executor()
{
return executor;
}
bool is_direct_execution() const { return m_direct_execution; }
protected:
void build();
void compile();
private:
......@@ -126,6 +139,7 @@ namespace ngraph
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::map<std::string, size_t> m_name_index_map;
......@@ -142,6 +156,15 @@ namespace ngraph
std::unique_ptr<MKLDNNEmitter> m_mkldnn_emitter;
std::string m_function_name;
std::list<std::function<void(CPURuntimeContext*)>> functors;
std::function<void(CPURuntimeContext*, std::vector<void*>&, std::vector<void*>&)>
executor;
std::unordered_map<std::string, void*> tensor_data;
std::unordered_map<std::string, size_t> intermediates_offsets;
std::unordered_map<std::string, size_t> function_input_index, function_output_index;
bool m_is_built;
bool m_direct_execution;
};
}
}
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void abs(void* input0, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
out.device(eigen::global_thread_pool_device) = in0.abs();
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void add(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) = in0 + in1;
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void multiply(void* input0, void* input1, void* output, size_t count)
{
Eigen::array<Eigen::Index, 1> out_dims, in_dims;
out_dims[0] = in_dims[0] = count;
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
static_cast<ElementType*>(input0), in_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in1(
static_cast<ElementType*>(input1), in_dims);
out.device(eigen::global_thread_pool_device) = in0 * in1;
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cstring>
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void result(const void* arg, void* out, size_t count)
{
memcpy(out, arg, sizeof(ElementType) * count);
}
}
}
}
}
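Since the DEX kernels are plain function templates over raw pointers, they can be exercised in isolation. A minimal standalone check of the add kernel (illustrative only; assumes the ngraph and Eigen headers are on the include path and the Eigen thread-pool device is linked in):

#include "ngraph/runtime/cpu/kernel/add.hpp"
int main()
{
float a[4] = {1, 2, 3, 4};
float b[4] = {10, 20, 30, 40};
float c[4] = {0, 0, 0, 0};
// Element-wise sum over 4 elements; c becomes {11, 22, 33, 44}.
ngraph::runtime::cpu::kernel::add<float>(a, b, c, 4);
return 0;
}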