Commit 3bc71391 authored by Jaikrishnan Menon's avatar Jaikrishnan Menon Committed by Scott Cyphers

Optimize sum reductions for common shapes (#811)

* CPU: Optimize Sum reductions

* CPU: Optimize 1D reduce all case

* CPU: Optimize 4D reduce all sum

* CPU: Tweaks

* Formatting fixes
parent f27c2289
......@@ -205,6 +205,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
runtime/cpu/mkldnn_utils.cpp
runtime/cpu/kernel/eigen_thread_pool.cpp
runtime/cpu/kernel/pad.cpp
runtime/cpu/kernel/reduce_sum.cpp
runtime/cpu/op/conv_bias.cpp
runtime/cpu/op/conv_relu.cpp
runtime/cpu/op/convert_layout.cpp
......
......@@ -1710,13 +1710,53 @@ namespace ngraph
<< "});\n";
}
#else
kernel::emit_sum(writer,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
sum->get_reduction_axes());
if (args[0].get_element_type() == element::f32 && args[0].get_shape().size() == 1 &&
sum->get_reduction_axes().size() == 1)
{
writer << "cpu::kernel::reduce_sum_all_1d_float32(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "{" << join(args[0].get_shape()) << "}, "
<< "{" << join(out[0].get_shape()) << "}"
<< ");\n";
}
else if (args[0].get_element_type() == element::f32 &&
args[0].get_shape().size() == 2 && sum->get_reduction_axes().size() == 2)
{
writer << "cpu::kernel::reduce_sum_all_2d_float32(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "{" << join(args[0].get_shape()) << "}, "
<< "{" << join(out[0].get_shape()) << "}"
<< ");\n";
}
else if (args[0].get_element_type() == element::f32 &&
args[0].get_shape().size() == 2 && sum->get_reduction_axes().size() == 1)
{
writer << "cpu::kernel::reduce_sum_2d_1rd_float32(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "{" << join(args[0].get_shape()) << "}, "
<< "{" << join(out[0].get_shape()) << "}, "
<< "{" << join(sum->get_reduction_axes()) << "}"
<< ");\n";
}
else if (args[0].get_element_type() == element::f32 &&
args[0].get_shape().size() == 4 && sum->get_reduction_axes().size() == 4)
{
writer << "cpu::kernel::reduce_sum_all_4d_float32(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "{" << join(args[0].get_shape()) << "}, "
<< "{" << join(out[0].get_shape()) << "}"
<< ");\n";
}
else
{
kernel::emit_sum(writer,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
sum->get_reduction_axes());
}
#endif
writer.block_end();
}
......
......@@ -108,6 +108,7 @@ namespace mkl
namespace ngraph
{
class Shape;
class AxisSet;
namespace runtime
{
......@@ -122,6 +123,27 @@ namespace ngraph
const Shape& output_shape,
const Shape& padding_below,
const Shape& padding_above);
void reduce_sum_all_1d_float32(float* input,
float* output,
const Shape& input_shape,
const Shape& output_shape);
void reduce_sum_all_2d_float32(float* input,
float* output,
const Shape& input_shape,
const Shape& output_shape);
void reduce_sum_2d_1rd_float32(float* input,
float* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes);
void reduce_sum_all_4d_float32(float* input,
float* output,
const Shape& input_shape,
const Shape& output_shape);
}
}
}
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "reduce_sum.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
// Sums every element of a rank-1 float32 tensor into the scalar `output[0]`.
// Thin dispatch shim so the emitted code can call a non-template symbol.
void reduce_sum_all_1d_float32(
    float* input, float* output, const Shape& input_shape, const Shape& output_shape)
{
    reduce_sum_all<float, 1>(input, output, input_shape, output_shape);
}
// Sums every element of a rank-2 float32 tensor into the scalar `output[0]`.
// Thin dispatch shim so the emitted code can call a non-template symbol.
void reduce_sum_all_2d_float32(
    float* input, float* output, const Shape& input_shape, const Shape& output_shape)
{
    reduce_sum_all<float, 2>(input, output, input_shape, output_shape);
}
// Reduces a rank-2 float32 tensor along exactly one axis (given in
// `reduction_axes`), producing a rank-1 result in `output`.
// Thin dispatch shim so the emitted code can call a non-template symbol.
void reduce_sum_2d_1rd_float32(float* input,
                               float* output,
                               const Shape& input_shape,
                               const Shape& output_shape,
                               const AxisSet& reduction_axes)
{
    reduce_sum<float, 2, 1>(input, output, input_shape, output_shape, reduction_axes);
}
// Sums every element of a rank-4 float32 tensor into the scalar `output[0]`.
// Thin dispatch shim so the emitted code can call a non-template symbol.
void reduce_sum_all_4d_float32(
    float* input, float* output, const Shape& input_shape, const Shape& output_shape)
{
    reduce_sum_all<float, 4>(input, output, input_shape, output_shape);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType, unsigned int Rank>
void reduce_sum_all(ElementType* input,
ElementType* output,
const Shape& input_shape,
const Shape& output_shape)
{
Eigen::array<Eigen::Index, Rank> in_dims;
Eigen::array<Eigen::Index, 0> out_dims;
for (int i = 0; i < Rank; i++)
{
in_dims[i] = input_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, 0, Eigen::RowMajor>> out(output,
out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(input,
in_dims);
out.device(eigen::global_thread_pool_device) = in.sum();
}
template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
void reduce_sum(ElementType* input,
ElementType* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
Eigen::array<Eigen::Index, Rank> in_dims;
Eigen::array<Eigen::Index, Rank - ReductionDims> out_dims;
Eigen::array<Eigen::Index, ReductionDims> reduction_dims;
for (int i = 0; i < Rank; i++)
{
in_dims[i] = input_shape[i];
}
for (int i = 0; i < Rank - ReductionDims; i++)
{
out_dims[i] = output_shape[i];
}
int i = 0;
for (auto axis : reduction_axes)
{
reduction_dims[i++] = axis;
}
Eigen::TensorMap<
Eigen::Tensor<ElementType, Rank - ReductionDims, Eigen::RowMajor>>
out(output, out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(input,
in_dims);
out.device(eigen::global_thread_pool_device) = in.sum(reduction_dims);
}
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment