DEX: Enable packetized innermost dim reducers (#1431)

d473eda9 · Jaikrishnan Menon · Scott Cyphers · d742c501 · d473eda9 · d473eda9
Commit d473eda9 authored Aug 17, 2018 by Jaikrishnan Menon Committed by Scott Cyphers Aug 17, 2018
5 changed files
--- a/src/ngraph/runtime/cpu/builder/reduction.hpp
+++ b/src/ngraph/runtime/cpu/builder/reduction.hpp
@@ -54,6 +54,21 @@
                                                                                                   \
    if (reduction_axes.size() == 1)                                                                \
    {                                                                                              \
+        if (*reduction_axes.begin() == arg_rank - 1)                                               \
+        {                                                                                          \
+            std::function<decltype(runtime::cpu::kernel::reduce_##K##_innermost_1rd<float, 2>)>    \
+                kernel;                                                                            \
+            SELECT_KERNEL_BY_RANK(kernel,                                                          \
+                                  result_element_type,                                             \
+                                  arg_rank,                                                        \
+                                  runtime::cpu::kernel::reduce_##K##_innermost_1rd);               \
+            auto functor = [&, kernel, arg_shape, result_shape](CPURuntimeContext* ctx) {          \
+                kernel(arg_tensor, out_tensor, arg_shape, result_shape);                           \
+            };                                                                                     \
+            functors.emplace_back(functor);                                                        \
+            return;                                                                                \
+        }                                                                                          \
+                                                                                                   \
        std::function<decltype(runtime::cpu::kernel::reduce_##K##_1rd<float, 2>)> kernel;          \
        SELECT_KERNEL_BY_RANK(                                                                     \
            kernel, result_element_type, arg_rank, runtime::cpu::kernel::reduce_##K##_1rd);        \

--- a/src/ngraph/runtime/cpu/kernel/reduce_max.hpp
+++ b/src/ngraph/runtime/cpu/kernel/reduce_max.hpp
@@ -52,6 +52,33 @@ namespace ngraph
                    out.device(eigen::global_thread_pool_device) = in.maximum();
                }

+                // Computes the maximum over the last (innermost) dimension of a row-major
+                // tensor of rank `Rank`, producing a row-major tensor of rank `Rank - 1`.
+                //
+                // input        - buffer read as ElementType[]; viewed with the extents
+                //                input_shape[0 .. Rank-1].
+                // output       - buffer written as ElementType[]; viewed with the extents
+                //                output_shape[0 .. Rank-2].
+                // input_shape  - must provide at least Rank extents.
+                // output_shape - must provide at least Rank - 1 extents; presumably
+                //                input_shape with its last extent dropped (not checked here).
+                //
+                // The reduced axis is handed to Eigen as a compile-time index
+                // (Eigen::type2index) instead of a runtime value, and the expression is
+                // evaluated on eigen::global_thread_pool_device.
+                template <typename ElementType, unsigned int Rank>
+                void reduce_max_innermost_1rd(void* input,
+                                              void* output,
+                                              const Shape& input_shape,
+                                              const Shape& output_shape)
+                {
+                    // Rank is unsigned, so Rank - 1 would wrap around for Rank == 0.
+                    static_assert(Rank > 0,
+                                  "innermost-axis reduction needs an input of rank >= 1");
+
+                    Eigen::array<Eigen::Index, Rank> in_dims;
+                    Eigen::array<Eigen::Index, Rank - 1> out_dims;
+                    Eigen::IndexList<Eigen::type2index<Rank - 1>> reduction_dim;
+
+                    // Unsigned counters match the type of Rank, which avoids a
+                    // signed/unsigned comparison in the loop conditions.
+                    for (unsigned int i = 0; i < Rank; ++i)
+                    {
+                        in_dims[i] = static_cast<Eigen::Index>(input_shape[i]);
+                    }
+
+                    for (unsigned int i = 0; i < Rank - 1; ++i)
+                    {
+                        out_dims[i] = static_cast<Eigen::Index>(output_shape[i]);
+                    }
+
+                    // Non-owning views over the caller's buffers.
+                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank - 1, Eigen::RowMajor>> out(
+                        static_cast<ElementType*>(output), out_dims);
+                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
+                        static_cast<ElementType*>(input), in_dims);
+                    out.device(eigen::global_thread_pool_device) = in.maximum(reduction_dim);
+                }
+
                template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
                void reduce_max(void* input,
                                void* output,

--- a/src/ngraph/runtime/cpu/kernel/reduce_min.hpp
+++ b/src/ngraph/runtime/cpu/kernel/reduce_min.hpp
@@ -52,6 +52,33 @@ namespace ngraph
                    out.device(eigen::global_thread_pool_device) = in.minimum();
                }

+                // Computes the minimum over the last (innermost) dimension of a row-major
+                // tensor of rank `Rank`, producing a row-major tensor of rank `Rank - 1`.
+                //
+                // input        - buffer read as ElementType[]; viewed with the extents
+                //                input_shape[0 .. Rank-1].
+                // output       - buffer written as ElementType[]; viewed with the extents
+                //                output_shape[0 .. Rank-2].
+                // input_shape  - must provide at least Rank extents.
+                // output_shape - must provide at least Rank - 1 extents; presumably
+                //                input_shape with its last extent dropped (not checked here).
+                //
+                // The reduced axis is handed to Eigen as a compile-time index
+                // (Eigen::type2index) instead of a runtime value, and the expression is
+                // evaluated on eigen::global_thread_pool_device.
+                template <typename ElementType, unsigned int Rank>
+                void reduce_min_innermost_1rd(void* input,
+                                              void* output,
+                                              const Shape& input_shape,
+                                              const Shape& output_shape)
+                {
+                    // Rank is unsigned, so Rank - 1 would wrap around for Rank == 0.
+                    static_assert(Rank > 0,
+                                  "innermost-axis reduction needs an input of rank >= 1");
+
+                    Eigen::array<Eigen::Index, Rank> in_dims;
+                    Eigen::array<Eigen::Index, Rank - 1> out_dims;
+                    Eigen::IndexList<Eigen::type2index<Rank - 1>> reduction_dim;
+
+                    // Unsigned counters match the type of Rank, which avoids a
+                    // signed/unsigned comparison in the loop conditions.
+                    for (unsigned int i = 0; i < Rank; ++i)
+                    {
+                        in_dims[i] = static_cast<Eigen::Index>(input_shape[i]);
+                    }
+
+                    for (unsigned int i = 0; i < Rank - 1; ++i)
+                    {
+                        out_dims[i] = static_cast<Eigen::Index>(output_shape[i]);
+                    }
+
+                    // Non-owning views over the caller's buffers.
+                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank - 1, Eigen::RowMajor>> out(
+                        static_cast<ElementType*>(output), out_dims);
+                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
+                        static_cast<ElementType*>(input), in_dims);
+                    out.device(eigen::global_thread_pool_device) = in.minimum(reduction_dim);
+                }
+
                template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
                void reduce_min(void* input,
                                void* output,

--- a/src/ngraph/runtime/cpu/kernel/reduce_product.hpp
+++ b/src/ngraph/runtime/cpu/kernel/reduce_product.hpp
@@ -52,6 +52,33 @@ namespace ngraph
                    out.device(eigen::global_thread_pool_device) = in.prod();
                }

+                // Computes the product over the last (innermost) dimension of a row-major
+                // tensor of rank `Rank`, producing a row-major tensor of rank `Rank - 1`.
+                //
+                // input        - buffer read as ElementType[]; viewed with the extents
+                //                input_shape[0 .. Rank-1].
+                // output       - buffer written as ElementType[]; viewed with the extents
+                //                output_shape[0 .. Rank-2].
+                // input_shape  - must provide at least Rank extents.
+                // output_shape - must provide at least Rank - 1 extents; presumably
+                //                input_shape with its last extent dropped (not checked here).
+                //
+                // The reduced axis is handed to Eigen as a compile-time index
+                // (Eigen::type2index) instead of a runtime value, and the expression is
+                // evaluated on eigen::global_thread_pool_device.
+                template <typename ElementType, unsigned int Rank>
+                void reduce_product_innermost_1rd(void* input,
+                                                  void* output,
+                                                  const Shape& input_shape,
+                                                  const Shape& output_shape)
+                {
+                    // Rank is unsigned, so Rank - 1 would wrap around for Rank == 0.
+                    static_assert(Rank > 0,
+                                  "innermost-axis reduction needs an input of rank >= 1");
+
+                    Eigen::array<Eigen::Index, Rank> in_dims;
+                    Eigen::array<Eigen::Index, Rank - 1> out_dims;
+                    Eigen::IndexList<Eigen::type2index<Rank - 1>> reduction_dim;
+
+                    // Unsigned counters match the type of Rank, which avoids a
+                    // signed/unsigned comparison in the loop conditions.
+                    for (unsigned int i = 0; i < Rank; ++i)
+                    {
+                        in_dims[i] = static_cast<Eigen::Index>(input_shape[i]);
+                    }
+
+                    for (unsigned int i = 0; i < Rank - 1; ++i)
+                    {
+                        out_dims[i] = static_cast<Eigen::Index>(output_shape[i]);
+                    }
+
+                    // Non-owning views over the caller's buffers.
+                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank - 1, Eigen::RowMajor>> out(
+                        static_cast<ElementType*>(output), out_dims);
+                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
+                        static_cast<ElementType*>(input), in_dims);
+                    out.device(eigen::global_thread_pool_device) = in.prod(reduction_dim);
+                }
+
                template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
                void reduce_product(void* input,
                                    void* output,

--- a/src/ngraph/runtime/cpu/kernel/reduce_sum.hpp
+++ b/src/ngraph/runtime/cpu/kernel/reduce_sum.hpp
@@ -52,6 +52,33 @@ namespace ngraph
                    out.device(eigen::global_thread_pool_device) = in.sum();
                }

+                // Computes the sum over the last (innermost) dimension of a row-major
+                // tensor of rank `Rank`, producing a row-major tensor of rank `Rank - 1`.
+                //
+                // input        - buffer read as ElementType[]; viewed with the extents
+                //                input_shape[0 .. Rank-1].
+                // output       - buffer written as ElementType[]; viewed with the extents
+                //                output_shape[0 .. Rank-2].
+                // input_shape  - must provide at least Rank extents.
+                // output_shape - must provide at least Rank - 1 extents; presumably
+                //                input_shape with its last extent dropped (not checked here).
+                //
+                // The reduced axis is handed to Eigen as a compile-time index
+                // (Eigen::type2index) instead of a runtime value, and the expression is
+                // evaluated on eigen::global_thread_pool_device.
+                template <typename ElementType, unsigned int Rank>
+                void reduce_sum_innermost_1rd(void* input,
+                                              void* output,
+                                              const Shape& input_shape,
+                                              const Shape& output_shape)
+                {
+                    // Rank is unsigned, so Rank - 1 would wrap around for Rank == 0.
+                    static_assert(Rank > 0,
+                                  "innermost-axis reduction needs an input of rank >= 1");
+
+                    Eigen::array<Eigen::Index, Rank> in_dims;
+                    Eigen::array<Eigen::Index, Rank - 1> out_dims;
+                    Eigen::IndexList<Eigen::type2index<Rank - 1>> reduction_dim;
+
+                    // Unsigned counters match the type of Rank, which avoids a
+                    // signed/unsigned comparison in the loop conditions.
+                    for (unsigned int i = 0; i < Rank; ++i)
+                    {
+                        in_dims[i] = static_cast<Eigen::Index>(input_shape[i]);
+                    }
+
+                    for (unsigned int i = 0; i < Rank - 1; ++i)
+                    {
+                        out_dims[i] = static_cast<Eigen::Index>(output_shape[i]);
+                    }
+
+                    // Non-owning views over the caller's buffers.
+                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank - 1, Eigen::RowMajor>> out(
+                        static_cast<ElementType*>(output), out_dims);
+                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
+                        static_cast<ElementType*>(input), in_dims);
+                    out.device(eigen::global_thread_pool_device) = in.sum(reduction_dim);
+                }
+
                template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
                void reduce_sum(void* input,
                                void* output,