Commit 7d59542d authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

Jbobba/dex computation reuse (#1219)

* CPU Direct Execution: Implement ConvertLayout and refactor

* CPU Direct Execution: Implement Convolution

* 1) Adds computation reuse to direct execution
2) Add avg_pool, broadcast and convolution_bias to direct execution
3) Moved some computation reuse utility functions to graph_utils

* Use lists instead of vectors to avoid reallocation overheads

* - Style fix

* style fix
parent 260cb90d
......@@ -503,3 +503,49 @@ size_t ngraph::get_user_count(Node* node)
}
return count;
}
// Returns true when `node` produces a function output tensor: either the node
// is itself flagged as an output, or one of its outputs feeds an op::Result
// whose copy has been eliminated (so this node writes the result buffer
// directly).
bool ngraph::computes_result(Node* node)
{
    if (node->is_output())
    {
        return true;
    }

    // Check if node feeds a result node that has been copy eliminated.
    for (const descriptor::Output& node_output : node->get_outputs())
    {
        for (const descriptor::Input* user_input : node_output.get_inputs())
        {
            auto result_op =
                std::dynamic_pointer_cast<ngraph::op::Result>(user_input->get_node());
            if (result_op && !result_op->needs_copy())
            {
                return true;
            }
        }
    }
    return false;
}
// Returns true if any consumer of `node`'s outputs carries an in-place
// output/input annotation whose input index matches the edge coming from this
// node — i.e. a downstream in-place kernel may overwrite this node's output
// tensor.
bool ngraph::possibly_overwritten(Node* node)
{
    for (const descriptor::Output& node_output : node->get_outputs())
    {
        for (const descriptor::Input* user_input : node_output.get_inputs())
        {
            auto user_op = std::dynamic_pointer_cast<ngraph::op::Op>(user_input->get_node());
            if (!user_op)
            {
                continue;
            }
            auto annotations = user_op->get_op_annotations();
            if (!annotations)
            {
                continue;
            }
            for (const auto& oi_pair : annotations->get_in_place_oi_pairs())
            {
                if (user_input->get_index() == oi_pair.second)
                {
                    return true;
                }
            }
        }
    }
    return false;
}
......@@ -141,4 +141,11 @@ namespace ngraph
// Returns count of `node` users that are still live in the graph
size_t get_user_count(Node* node);
// Returns true if `node` computes an output tensor
bool computes_result(Node* node);
// Returns true if a node's user could potentially overwrite
// the output of this node with in-place kernels
bool possibly_overwritten(Node* node);
}
......@@ -27,6 +27,7 @@ set(SRC
cpu_tensor_view_wrapper.cpp
cpu_tensor_view.cpp
cpu_tracing.cpp
builder/avg_pool.cpp
builder/convert_layout.cpp
builder/convolution.cpp
kernel/eigen_thread_pool.cpp
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/kernel/avg_pool.hpp"
#include "ngraph/op/avg_pool.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
// Direct-execution builder for ngraph::op::AvgPool: appends one functor to the
// external function's functor list, using either a prebuilt MKLDNN pooling
// primitive or the typed reference kernel.
template <>
void Builder::BUILDER_DECL(ngraph::op::AvgPool)
{
auto avg_pool = static_cast<const ngraph::op::AvgPool*>(node);
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
// Shapes are copied by value; tensor slots are taken by reference so the
// functor observes whatever pointer the executor installs at run time.
auto arg0_shape = args[0].get_shape();
auto out_shape = out[0].get_shape();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
// Pooling attributes taken from the op node at build time.
auto window_shape = avg_pool->get_window_shape();
auto window_movement_strides = avg_pool->get_window_movement_strides();
auto padding_below = avg_pool->get_padding_below();
auto padding_above = avg_pool->get_padding_above();
auto include_padding_in_avg_computation =
avg_pool->get_include_padding_in_avg_computation();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
// MKLDNN path: build the pooling primitive once now; the functor only
// rebinds the memory pointers and invokes the cached primitive.
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_emitter->build_memory_descriptor(
args[0], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0));
auto result_desc = mkldnn_emitter->build_memory_descriptor(
out[0], runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0));
size_t avg_pool_index = mkldnn_emitter->build_pooling_forward(
(include_padding_in_avg_computation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
input_desc,
result_desc,
window_movement_strides,
window_shape,
padding_below,
padding_above);
// deps[0]/deps[1] are the primitive's input/output memory handles.
auto& deps = mkldnn_emitter->get_primitive_deps(avg_pool_index);
// `deps` and the tensor slots live in the external function, which also
// owns the functor list, so by-reference capture is safe here; the
// primitive index is captured by value.
auto functor = [&, avg_pool_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, avg_pool_index);
};
functors.emplace_back(functor);
}
else
{
// Reference path: pick the kernel instantiation matching the output
// element type, then capture all pooling attributes by value.
std::function<decltype(runtime::cpu::kernel::avg_pool<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::avg_pool);
auto functor = [&,
kernel,
arg0_shape,
out_shape,
window_shape,
window_movement_strides,
padding_below,
padding_above,
include_padding_in_avg_computation](CPURuntimeContext* ctx) {
kernel(arg0_tensor,
out_tensor,
arg0_shape,
out_shape,
window_shape,
window_movement_strides,
padding_below,
padding_above,
include_padding_in_avg_computation);
};
functors.emplace_back(functor);
}
}
}
}
}
......@@ -19,6 +19,7 @@
#include "ngraph/runtime/cpu/kernel/convolution.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
using namespace std;
using namespace ngraph;
......@@ -141,6 +142,83 @@ namespace ngraph
}
}
// Direct-execution builder for ngraph::op::ConvolutionBias: builds the fused
// convolution+bias MKLDNN primitive once and queues a functor that rebinds
// memory pointers and invokes it. ConvolutionBias has no reference fallback,
// so a non-MKLDNN placement is a build-time error.
// Fixes vs. previous revision: removed unused locals (arg0_shape, arg1_shape,
// result_shape) and the dead `conv_index = 0` initialization.
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBias)
{
    auto convolution = static_cast<const ngraph::op::ConvolutionBias*>(node);
    auto& functors = external_function->get_functors();
    auto& tensor_data = external_function->get_tensor_data();

    // Tensor slots are captured by reference below so the functor reads the
    // pointers the executor installs at run time.
    auto& arg0_tensor = tensor_data[args[0].get_name()];
    auto& arg1_tensor = tensor_data[args[1].get_name()];
    auto& arg2_tensor = tensor_data[args[2].get_name()];
    auto& out_tensor = tensor_data[out[0].get_name()];

    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        // For dilation, MKLDNN wants to know how many elements to insert between, not how far
        // apart to space the elements like nGraph. So we have to subtract 1 from each pos.
        Strides window_dilation_strides_adjusted;
        for (size_t s : convolution->get_window_dilation_strides())
        {
            window_dilation_strides_adjusted.push_back(s - 1);
        }

        auto input_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
        auto weights_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1);
        // HACK to help MKLDNN pick the right implementation
        if (weights_format == mkldnn::memory::format::nchw)
        {
            weights_format = mkldnn::memory::format::oihw;
        }
        auto bias_format = mkldnn_utils::get_input_mkldnn_format(node, 2);
        auto output_format = runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0);

        auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
        auto input_data_desc = mkldnn_emitter->build_memory_descriptor(args[0], input_format);
        auto weights_desc = mkldnn_emitter->build_memory_descriptor(args[1], weights_format);
        auto bias_desc = mkldnn_emitter->build_memory_descriptor(args[2], bias_format);
        auto result_desc = mkldnn_emitter->build_memory_descriptor(out[0], output_format);

        size_t conv_index = mkldnn_emitter->build_convolution_forward(
            input_data_desc,
            weights_desc,
            bias_desc,
            result_desc,
            convolution->get_window_movement_strides(),
            window_dilation_strides_adjusted,
            convolution->get_padding_below(),
            convolution->get_padding_above());

        // deps = {input, weights, bias, output} memory handles of the primitive.
        auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
        auto functor = [&, conv_index](CPURuntimeContext* ctx) {
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[3], out_tensor);
            cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
        };
        functors.emplace_back(functor);
    }
    else
    {
        throw ngraph_error("ConvolutionBias is only supported with MKLDNN kernel.");
    }
}
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBackpropData)
{
......
......@@ -94,6 +94,7 @@
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/kernel/abs.hpp"
#include "ngraph/runtime/cpu/kernel/add.hpp"
#include "ngraph/runtime/cpu/kernel/broadcast.hpp"
#include "ngraph/runtime/cpu/kernel/ceil.hpp"
#include "ngraph/runtime/cpu/kernel/multiply.hpp"
#include "ngraph/runtime/cpu/kernel/relu.hpp"
......@@ -175,6 +176,33 @@ namespace ngraph
BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::abs);
}
// Direct-execution builder for ngraph::op::Broadcast: selects the reference
// broadcast kernel instantiation for the output element type and queues a
// functor that invokes it.
template <>
void Builder::BUILDER_DECL(ngraph::op::Broadcast)
{
    auto broadcast = static_cast<const ngraph::op::Broadcast*>(node);
    auto& functors = external_function->get_functors();
    auto& tensor_data = external_function->get_tensor_data();

    // Shapes and axes are captured by value below; tensor slots by reference
    // so the functor sees the pointers the executor installs at run time.
    auto in_shape = args[0].get_shape();
    auto out_shape = out[0].get_shape();
    auto axes = broadcast->get_broadcast_axes();
    auto& in_tensor = tensor_data[args[0].get_name()];
    auto& out_tensor = tensor_data[out[0].get_name()];

    // Pick the kernel instantiation matching the output element type.
    std::function<void(void*, void*, const Shape&, const Shape&, const AxisSet&)>
        kernel;
    SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::broadcast);

    functors.emplace_back(
        [&, kernel, in_shape, out_shape, axes](CPURuntimeContext* ctx) {
            kernel(in_tensor, out_tensor, in_shape, out_shape, axes);
        });
}
template <>
void Builder::BUILDER_DECL(ngraph::op::Ceiling)
{
......@@ -370,11 +398,15 @@ namespace ngraph
{TI(ngraph::op::Multiply), &runtime::cpu::Builder::build<ngraph::op::Multiply>},
{TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
{TI(ngraph::op::Abs), &runtime::cpu::Builder::build<ngraph::op::Abs>},
{TI(ngraph::op::AvgPool), &runtime::cpu::Builder::build<ngraph::op::AvgPool>},
{TI(ngraph::op::Broadcast), &runtime::cpu::Builder::build<ngraph::op::Broadcast>},
{TI(ngraph::op::Ceiling), &runtime::cpu::Builder::build<ngraph::op::Ceiling>},
{TI(ngraph::runtime::cpu::op::ConvertLayout),
&runtime::cpu::Builder::build<ngraph::runtime::cpu::op::ConvertLayout>},
{TI(ngraph::op::Convolution),
&runtime::cpu::Builder::build<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBias),
&runtime::cpu::Builder::build<ngraph::op::ConvolutionBias>},
{TI(ngraph::op::ConvolutionBackpropData),
&runtime::cpu::Builder::build<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBackpropFilters),
......
......@@ -810,53 +810,9 @@ using namespace ngraph::runtime;
}
}
auto computes_output = [&]() {
if (node->is_output())
{
return true;
}
// Check if node feeds a result node that has been copy eliminated
for (const descriptor::Output& output : node->get_outputs())
{
for (const descriptor::Input* input : output.get_inputs())
{
auto res =
std::dynamic_pointer_cast<ngraph::op::Result>(input->get_node());
if (res && !res->needs_copy())
{
return true;
}
}
}
return false;
};
auto possibly_overwritten = [&]() {
for (const descriptor::Output& output : node->get_outputs())
{
for (const descriptor::Input* input : output.get_inputs())
{
if (auto op =
std::dynamic_pointer_cast<ngraph::op::Op>(input->get_node()))
{
if (auto op_annotations = op->get_op_annotations())
{
for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
{
if (input->get_index() == oi_pair.second)
{
return true;
}
}
}
}
}
}
return false;
};
// Always enable nodes computing output tensors or nodes whose outputs might get
// overwritten due to inplace kernels
if (computes_output() || possibly_overwritten())
if (computes_result(node.get()) || possibly_overwritten(node.get()))
{
writer << " || 1";
}
......@@ -1182,6 +1138,10 @@ void runtime::cpu::CPU_ExternalFunction::build()
for (shared_ptr<Node> node : m_function->get_ordered_ops())
{
if (node->is_parameter() || node->is_constant())
{
continue;
}
auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = build_dispatcher.find(type_index(typeid(n)));
......@@ -1190,23 +1150,48 @@ void runtime::cpu::CPU_ExternalFunction::build()
throw ngraph_error("Unhandled op during code generation : " + node->description());
}
vector<TensorViewWrapper> in;
vector<string> in_names;
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(TensorViewWrapper(tv, tv->get_tensor().get_name()));
in_names.push_back(tv->get_tensor().get_name());
}
vector<TensorViewWrapper> out;
vector<string> out_names;
for (const descriptor::Output& output : node->get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(TensorViewWrapper(tv, tv->get_tensor().get_name()));
out_names.push_back(tv->get_tensor().get_name());
}
size_t functor_count = functors.size();
handler->second(this, node.get(), in, out);
bool disable_caching = computes_result(node.get()) || possibly_overwritten(node.get());
auto enable = [&, in_names, out_names, disable_caching](CPURuntimeContext* ctx) -> bool {
bool en = false;
for (const auto& name : in_names)
{
if (tensor_stale[name] || disable_caching)
{
en = true;
}
}
for (const auto& name : out_names)
{
tensor_stale[name] = en;
}
return en;
};
enables.emplace_back(make_pair(enable, functors.size() - functor_count));
}
executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) {
static bool first_iteration = true;
for (auto& p : intermediates_offsets)
{
tensor_data[p.first] =
......@@ -1216,6 +1201,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
for (const auto& p : function_input_index)
{
tensor_data[p.first] = inputs[p.second];
tensor_stale[p.first] = ctx->p_en[p.second];
}
for (const auto& p : function_output_index)
......@@ -1223,10 +1209,23 @@ void runtime::cpu::CPU_ExternalFunction::build()
tensor_data[p.first] = outputs[p.second];
}
for (const auto& functor : functors)
auto functor = functors.begin();
for (const auto& p : enables)
{
functor(ctx);
if (p.first(ctx) || first_iteration)
{
for (size_t j = 0; j < p.second; j++)
{
(*functor)(ctx);
std::advance(functor, 1);
}
}
else
{
std::advance(functor, p.second);
}
}
first_iteration = false;
};
m_is_built = true;
......
......@@ -158,9 +158,11 @@ namespace ngraph
std::string m_function_name;
std::list<std::function<void(CPURuntimeContext*)>> functors;
std::list<std::pair<std::function<bool(CPURuntimeContext*)>, size_t>> enables;
std::function<void(CPURuntimeContext*, std::vector<void*>&, std::vector<void*>&)>
executor;
std::unordered_map<std::string, void*> tensor_data;
std::unordered_map<std::string, bool> tensor_stale;
std::unordered_map<std::string, size_t> intermediates_offsets;
std::unordered_map<std::string, size_t> function_input_index, function_output_index;
bool m_is_built;
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/runtime/reference/avg_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
// Thin type-erasure adapter: casts the untyped tensor pointers to the
// concrete element type and forwards all pooling attributes unchanged to the
// reference average-pooling implementation.
template <typename ElementType>
void avg_pool(void* arg,
              void* out,
              const Shape& arg_shape,
              const Shape& out_shape,
              const Shape& window_shape,
              const Strides& window_movement_strides,
              const Shape& padding_below,
              const Shape& padding_above,
              bool include_padding_in_avg_computation)
{
    auto in_ptr = static_cast<const ElementType*>(arg);
    auto out_ptr = static_cast<ElementType*>(out);
    reference::avg_pool<ElementType>(in_ptr,
                                     out_ptr,
                                     arg_shape,
                                     out_shape,
                                     window_shape,
                                     window_movement_strides,
                                     padding_below,
                                     padding_above,
                                     include_padding_in_avg_computation);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/runtime/reference/broadcast.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
// Thin type-erasure adapter: casts the untyped tensor pointers to the
// concrete element type and forwards to the reference broadcast
// implementation.
template <typename ElementType>
void broadcast(void* input0,
               void* output,
               const Shape& arg0_shape,
               const Shape& result_shape,
               const AxisSet& broadcast_axes)
{
    auto src = static_cast<const ElementType*>(input0);
    auto dst = static_cast<ElementType*>(output);
    reference::broadcast<ElementType>(
        src, dst, arg0_shape, result_shape, broadcast_axes);
}
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment