Commit ec45be4b authored by Jaikrishnan Menon, committed by Scott Cyphers

DEX: Reduce function (#1349)

* CPU Direct Execution: Implement Reduce

* Workarounds for ancient CI compilers

* Fix return types

* Review comments
parent 21012673
--- a/src/ngraph/runtime/cpu/CMakeLists.txt
+++ b/src/ngraph/runtime/cpu/CMakeLists.txt
@@ -44,6 +44,7 @@ set(SRC
     builder/relu.cpp
     builder/pad.cpp
     builder/product.cpp
+    builder/reduce_function.cpp
     builder/reduce_function_window.cpp
     builder/reshape.cpp
     builder/reverse.cpp
--- /dev/null
+++ b/src/ngraph/runtime/cpu/builder/reduce_function.cpp
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/kernel/reduce_function.hpp"
#include "ngraph/op/reduce.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/tensor_view.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            template <>
            void Builder::BUILDER_DECL(ngraph::op::Reduce)
            {
                auto reduce = static_cast<const ngraph::op::Reduce*>(node);
                auto function = reduce->get_functions()[0];

                auto& functors = external_function->get_functors();
                auto& tensor_data = external_function->get_tensor_data();
                auto& callees = external_function->get_callees();

                // Compile the user-supplied reduction function once and cache it
                // by name so that every Reduce using the same function shares one
                // compiled callee.
                if (!callees.count(function->get_name()))
                {
                    callees[function->get_name()] = make_shared<CPU_ExternalFunction>(function);
                }
                auto& reducer_external_function = callees[function->get_name()];

                auto& arg0_tensor = tensor_data[args[0].get_name()];
                auto& arg1_tensor = tensor_data[args[1].get_name()];
                auto& out_tensor = tensor_data[out[0].get_name()];

                auto arg0_shape = args[0].get_shape();
                auto out_shape = out[0].get_shape();
                auto reduction_axes = reduce->get_reduction_axes();

                if (reduction_axes.empty())
                {
                    // Reducing over no axes is the identity: copy input to output.
                    size_t size = args[0].get_size() * args[0].get_element_type().size();
                    auto functor = [&, size](CPURuntimeContext* ctx) {
                        memcpy(out_tensor, arg0_tensor, size);
                    };
                    functors.emplace_back(functor);
                }
                else if (reduction_axes.size() == 1)
                {
                    // One reduction axis: dispatch on element type and input rank.
                    std::function<decltype(runtime::cpu::kernel::reduce_function_1rd<float, 1>)>
                        kernel;

                    SELECT_KERNEL_BY_RANK(kernel,
                                          args[0].get_element_type(),
                                          arg0_shape.size(),
                                          runtime::cpu::kernel::reduce_function_1rd);

                    auto functor =
                        [&, kernel, arg0_shape, out_shape, reduction_axes](CPURuntimeContext* ctx) {
                            kernel(arg0_tensor,
                                   arg1_tensor,
                                   out_tensor,
                                   arg0_shape,
                                   out_shape,
                                   reduction_axes,
                                   reducer_external_function);
                        };
                    functors.emplace_back(functor);
                }
                else if (arg0_shape.size() == 2 && reduction_axes.size() == 2)
                {
                    // 2D input fully reduced to a scalar.
                    std::function<decltype(runtime::cpu::kernel::reduce_function_2d_2rd<float>)>
                        kernel;

                    SELECT_KERNEL(kernel,
                                  args[0].get_element_type(),
                                  runtime::cpu::kernel::reduce_function_2d_2rd);

                    auto functor =
                        [&, kernel, arg0_shape, out_shape, reduction_axes](CPURuntimeContext* ctx) {
                            kernel(arg0_tensor,
                                   arg1_tensor,
                                   out_tensor,
                                   arg0_shape,
                                   out_shape,
                                   reduction_axes,
                                   reducer_external_function);
                        };
                    functors.emplace_back(functor);
                }
                else if (arg0_shape.size() == 3 && reduction_axes.size() == 2)
                {
                    // 3D input reduced along two axes to a vector.
                    std::function<decltype(runtime::cpu::kernel::reduce_function_3d_2rd<float>)>
                        kernel;

                    SELECT_KERNEL(kernel,
                                  args[0].get_element_type(),
                                  runtime::cpu::kernel::reduce_function_3d_2rd);

                    auto functor =
                        [&, kernel, arg0_shape, out_shape, reduction_axes](CPURuntimeContext* ctx) {
                            kernel(arg0_tensor,
                                   arg1_tensor,
                                   out_tensor,
                                   arg0_shape,
                                   out_shape,
                                   reduction_axes,
                                   reducer_external_function);
                        };
                    functors.emplace_back(functor);
                }
                else
                {
                    throw ngraph_error("Unsupported Reduce");
                }
            }

            REGISTER_OP_BUILDER(Reduce);
        }
    }
}
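For context, here is a hedged sketch of the kind of graph this builder consumes, following the historical op::Reduce signature of this nGraph era (reductee, initial value, reduction function, reduction axes). Names and exact API details are illustrative and may differ slightly:

// Illustrative only: a sum-reduction over axis 0 expressed as op::Reduce.
// f(x, y) = x + y is the user-supplied reduction function that the builder
// above compiles into a callee CPU_ExternalFunction.
auto A = make_shared<op::Parameter>(element::f32, Shape{2, 3});
auto init = make_shared<op::Parameter>(element::f32, Shape{}); // initial value

auto f_x = make_shared<op::Parameter>(element::f32, Shape{});
auto f_y = make_shared<op::Parameter>(element::f32, Shape{});
auto f = make_shared<Function>(make_shared<op::Add>(f_x, f_y),
                               op::ParameterVector{f_x, f_y});

// Result shape is {3}: axis 0 is folded away, so the builder takes the
// single-reduction-axis branch and dispatches reduce_function_1rd.
auto sum = make_shared<op::Reduce>(A, init, f, AxisSet{0});
auto graph = make_shared<Function>(sum, op::ParameterVector{A, init});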
--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -1189,7 +1189,8 @@ void runtime::cpu::CPU_ExternalFunction::build()
         auto handler = build_dispatcher.find(type_index(typeid(n)));
         if (handler == build_dispatcher.end())
         {
-            throw ngraph_error("Unhandled op during code generation : " + node->description());
+            throw ngraph_error("Unhandled op during executor construction : " +
+                               node->description());
         }
         vector<TensorViewWrapper> in;
         vector<string> in_names;
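The error path above fires when no builder was registered for a node's concrete type. A minimal sketch of the type_index-keyed dispatch that REGISTER_OP_BUILDER feeds, simplified from the actual nGraph code (names like BuildFn and build_node are illustrative):

#include <functional>
#include <stdexcept>
#include <string>
#include <typeindex>
#include <unordered_map>

struct Node { virtual ~Node() = default; };
struct Reduce : Node {};

using BuildFn = std::function<void(Node&)>;
static std::unordered_map<std::type_index, BuildFn> build_dispatcher;

// REGISTER_OP_BUILDER(Reduce) conceptually performs an insertion like:
//   build_dispatcher[std::type_index(typeid(Reduce))] = &build_reduce;

void build_node(Node& n)
{
    // typeid on a polymorphic reference yields the dynamic type, so the
    // lookup matches the node's concrete op class.
    auto handler = build_dispatcher.find(std::type_index(typeid(n)));
    if (handler == build_dispatcher.end())
    {
        throw std::runtime_error("Unhandled op during executor construction: " +
                                 std::string(typeid(n).name()));
    }
    handler->second(n); // append the op's functor(s) to the execution list
}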
--- a/src/ngraph/runtime/cpu/cpu_tensor_view.cpp
+++ b/src/ngraph/runtime/cpu/cpu_tensor_view.cpp
@@ -32,8 +32,6 @@ using namespace std;
 // TODO(jmenon): Refactor all the alignment specifications into
 // a single place and allow lower or no alignment when possible
-const size_t runtime::cpu::CPUTensorView::BufferAlignment = 64;
 runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
                                            const Shape& shape,
                                            void* memory_pointer,
--- a/src/ngraph/runtime/cpu/cpu_tensor_view.hpp
+++ b/src/ngraph/runtime/cpu/cpu_tensor_view.hpp
@@ -57,9 +57,9 @@ namespace ngraph
             /// @param n Number of bytes to read, must be an integral number of elements.
             void read(void* p, size_t tensor_offset, size_t n) const override;
         private:
-            static const size_t BufferAlignment;
+            static constexpr size_t BufferAlignment = 64;
         private:
             char* buffer;
             char* aligned_buffer;
             size_t buffer_size;
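Why the switch from an out-of-line static const to an in-class static constexpr: the new kernel header uses BufferAlignment as the argument of an alignment attribute, which needs a constant expression visible in every including translation unit, while an out-of-line const definition only guarantees a value at link time. A minimal illustration, not from the commit (type names are invented):

#include <cstddef>

struct OldStyle
{
    static const size_t BufferAlignment; // value defined in one .cpp file
};

struct NewStyle
{
    static constexpr size_t BufferAlignment = 64; // value visible in-class
};

// OK everywhere: the initializer is part of the header.
char scratch __attribute__((aligned(NewStyle::BufferAlignment)));

// Rejected by many compilers: OldStyle::BufferAlignment's value is not
// known in this translation unit.
// char scratch2 __attribute__((aligned(OldStyle::BufferAlignment)));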
--- /dev/null
+++ b/src/ngraph/runtime/cpu/kernel/reduce_function.hpp
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/axis_set.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/type/element_type.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernel
            {
                // Adapter that lets Eigen's reduction machinery call back into a
                // compiled nGraph function. Eigen drives the reduction; for every
                // pair of elements it invokes reduce(), which stages the two
                // scalars as rank-0 tensors and executes the callee through a
                // call frame. Note this costs one full call-frame invocation per
                // reduced element.
                template <typename ElementType>
                struct Reducer
                {
                    static const bool PacketAccess = false; // no vectorized path
                    static const bool IsStateful = false;

                    ElementType initial;
                    const std::shared_ptr<CPU_ExternalFunction>& external_function;
                    std::shared_ptr<Backend> backend;

                    Reducer(ElementType x, const std::shared_ptr<CPU_ExternalFunction>& ef)
                        : initial(x)
                        , external_function(ef)
                        , backend(runtime::Backend::create("CPU"))
                    {
                    }

                    void reduce(const ElementType v, ElementType* R)
                    {
                        TensorViewPtrs inputs, outputs;

                        // Scalar staging buffers, aligned so they can back
                        // zero-copy CPU tensor views.
                        ElementType p __attribute__((aligned(CPUTensorView::BufferAlignment))) = v;
                        ElementType q __attribute__((aligned(CPUTensorView::BufferAlignment))) = *R;
                        ElementType r __attribute__((aligned(CPUTensorView::BufferAlignment)));

                        inputs.emplace_back(backend->create_tensor(
                            ngraph::element::from<ElementType>(), Shape{}, &p));
                        inputs.emplace_back(backend->create_tensor(
                            ngraph::element::from<ElementType>(), Shape{}, &q));
                        outputs.emplace_back(backend->create_tensor(
                            ngraph::element::from<ElementType>(), Shape{}, &r));

                        auto call_frame = external_function->make_call_frame();
                        call_frame->call(outputs, inputs);
                        *R = r;
                    }

                    ElementType initialize() const { return initial; }
                    ElementType finalize(const ElementType R) const { return R; }
                };
                template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
                void reduce_function(void* input0,
                                     void* input1,
                                     void* output,
                                     const Shape& input_shape,
                                     const Shape& output_shape,
                                     const AxisSet& reduction_axes,
                                     const std::shared_ptr<CPU_ExternalFunction>& external_function)
                {
                    Eigen::array<Eigen::Index, Rank> in_dims;
                    Eigen::array<Eigen::Index, Rank - ReductionDims> out_dims;
                    Eigen::array<Eigen::Index, ReductionDims> reduction_dims;

                    for (int i = 0; i < Rank; i++)
                    {
                        in_dims[i] = input_shape[i];
                    }
                    for (int i = 0; i < Rank - ReductionDims; i++)
                    {
                        out_dims[i] = output_shape[i];
                    }

                    // e.g. input_shape {2, 3, 4} with reduction_axes {0, 2}
                    // yields reduction_dims {0, 2} and out_dims {3}.
                    int i = 0;
                    for (auto axis : reduction_axes)
                    {
                        reduction_dims[i++] = axis;
                    }

                    Eigen::TensorMap<
                        Eigen::Tensor<ElementType, Rank - ReductionDims, Eigen::RowMajor>>
                        out(static_cast<ElementType*>(output), out_dims);
                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
                        static_cast<ElementType*>(input0), in_dims);

                    // input1 holds the scalar initial value for the reduction.
                    Reducer<ElementType> reducer(*static_cast<ElementType*>(input1),
                                                 external_function);

                    out.device(eigen::global_thread_pool_device) =
                        in.reduce(reduction_dims, reducer);
                }
                // Rank-specialized entry points matched by the SELECT_KERNEL and
                // SELECT_KERNEL_BY_RANK dispatch in the builder.
                template <typename ElementType, unsigned int Rank>
                void reduce_function_1rd(
                    void* input0,
                    void* input1,
                    void* output,
                    const Shape& input_shape,
                    const Shape& output_shape,
                    const AxisSet& reduction_axes,
                    const std::shared_ptr<CPU_ExternalFunction>& external_function)
                {
                    reduce_function<ElementType, Rank, 1>(input0,
                                                          input1,
                                                          output,
                                                          input_shape,
                                                          output_shape,
                                                          reduction_axes,
                                                          external_function);
                }

                template <typename ElementType>
                void reduce_function_2d_2rd(
                    void* input0,
                    void* input1,
                    void* output,
                    const Shape& input_shape,
                    const Shape& output_shape,
                    const AxisSet& reduction_axes,
                    const std::shared_ptr<CPU_ExternalFunction>& external_function)
                {
                    reduce_function<ElementType, 2, 2>(input0,
                                                       input1,
                                                       output,
                                                       input_shape,
                                                       output_shape,
                                                       reduction_axes,
                                                       external_function);
                }

                template <typename ElementType>
                void reduce_function_3d_2rd(
                    void* input0,
                    void* input1,
                    void* output,
                    const Shape& input_shape,
                    const Shape& output_shape,
                    const AxisSet& reduction_axes,
                    const std::shared_ptr<CPU_ExternalFunction>& external_function)
                {
                    reduce_function<ElementType, 3, 2>(input0,
                                                       input1,
                                                       output,
                                                       input_shape,
                                                       output_shape,
                                                       reduction_axes,
                                                       external_function);
                }
            }
        }
    }
}
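To make the Reducer contract concrete: Eigen's reduction framework expects a reducer exposing initialize(), reduce(), and finalize() plus the PacketAccess and IsStateful traits, exactly the surface the struct above implements. Below is a self-contained sketch with a simple max reducer standing in for the nGraph callee (illustrative only; MaxReducer is invented for this example):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <algorithm>
#include <iostream>
#include <limits>

template <typename T>
struct MaxReducer
{
    static const bool PacketAccess = false; // no vectorized reduce() provided
    static const bool IsStateful = false;   // Eigen may copy this freely

    T initialize() const { return std::numeric_limits<T>::lowest(); }
    void reduce(const T v, T* accum) const { *accum = std::max(*accum, v); }
    T finalize(const T accum) const { return accum; }
};

int main()
{
    Eigen::Tensor<float, 2, Eigen::RowMajor> in(2, 3);
    in.setValues({{1, 5, 2}, {4, 0, 3}});

    // Reduce away axis 0, keeping the 3 columns: the same dims mapping
    // reduce_function performs for a rank-2 input with one reduction axis.
    Eigen::array<Eigen::Index, 1> axes{{0}};
    Eigen::Tensor<float, 1, Eigen::RowMajor> out = in.reduce(axes, MaxReducer<float>());

    std::cout << out << std::endl; // prints: 4 5 3
}

The commit's Reducer follows the same protocol; the difference is that its reduce() runs a compiled nGraph function per element pair instead of an inline std::max.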