Commit 3bc71391 authored by Jaikrishnan Menon's avatar Jaikrishnan Menon Committed by Scott Cyphers

Optimize sum reductions for common shapes (#811)

* CPU: Optimize Sum reductions

* CPU: Optimize 1D reduce all case

* CPU: Optimize 4D reduce all sum

* CPU: Tweaks

* Formatting fixes
parent f27c2289
......@@ -205,6 +205,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
runtime/cpu/mkldnn_utils.cpp
runtime/cpu/kernel/eigen_thread_pool.cpp
runtime/cpu/kernel/pad.cpp
runtime/cpu/kernel/reduce_sum.cpp
runtime/cpu/op/conv_bias.cpp
runtime/cpu/op/conv_relu.cpp
runtime/cpu/op/convert_layout.cpp
......
......@@ -1710,13 +1710,53 @@ namespace ngraph
<< "});\n";
}
#else
kernel::emit_sum(writer,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
sum->get_reduction_axes());
if (args[0].get_element_type() == element::f32 && args[0].get_shape().size() == 1 &&
sum->get_reduction_axes().size() == 1)
{
writer << "cpu::kernel::reduce_sum_all_1d_float32(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "{" << join(args[0].get_shape()) << "}, "
<< "{" << join(out[0].get_shape()) << "}"
<< ");\n";
}
else if (args[0].get_element_type() == element::f32 &&
args[0].get_shape().size() == 2 && sum->get_reduction_axes().size() == 2)
{
writer << "cpu::kernel::reduce_sum_all_2d_float32(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "{" << join(args[0].get_shape()) << "}, "
<< "{" << join(out[0].get_shape()) << "}"
<< ");\n";
}
else if (args[0].get_element_type() == element::f32 &&
args[0].get_shape().size() == 2 && sum->get_reduction_axes().size() == 1)
{
writer << "cpu::kernel::reduce_sum_2d_1rd_float32(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "{" << join(args[0].get_shape()) << "}, "
<< "{" << join(out[0].get_shape()) << "}, "
<< "{" << join(sum->get_reduction_axes()) << "}"
<< ");\n";
}
else if (args[0].get_element_type() == element::f32 &&
args[0].get_shape().size() == 4 && sum->get_reduction_axes().size() == 4)
{
writer << "cpu::kernel::reduce_sum_all_4d_float32(" << args[0].get_name()
<< ", " << out[0].get_name() << ", "
<< "{" << join(args[0].get_shape()) << "}, "
<< "{" << join(out[0].get_shape()) << "}"
<< ");\n";
}
else
{
kernel::emit_sum(writer,
args[0].get_element_type().c_type_string(),
args[0].get_name(),
out[0].get_name(),
args[0].get_shape(),
out[0].get_shape(),
sum->get_reduction_axes());
}
#endif
writer.block_end();
}
......
......@@ -108,6 +108,7 @@ namespace mkl
namespace ngraph
{
class Shape;
class AxisSet;
namespace runtime
{
......@@ -122,6 +123,27 @@ namespace ngraph
const Shape& output_shape,
const Shape& padding_below,
const Shape& padding_above);
void reduce_sum_all_1d_float32(float* input,
float* output,
const Shape& input_shape,
const Shape& output_shape);
void reduce_sum_all_2d_float32(float* input,
float* output,
const Shape& input_shape,
const Shape& output_shape);
void reduce_sum_2d_1rd_float32(float* input,
float* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes);
void reduce_sum_all_4d_float32(float* input,
float* output,
const Shape& input_shape,
const Shape& output_shape);
}
}
}
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "reduce_sum.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
// Sums every element of a rank-1 float32 tensor into the scalar `output[0]`.
// Thin dispatch shim so the emitted code can call a non-template symbol.
void reduce_sum_all_1d_float32(
    float* input, float* output, const Shape& input_shape, const Shape& output_shape)
{
    reduce_sum_all<float, 1>(input, output, input_shape, output_shape);
}
// Sums every element of a rank-2 float32 tensor into the scalar `output[0]`.
// Thin dispatch shim so the emitted code can call a non-template symbol.
void reduce_sum_all_2d_float32(
    float* input, float* output, const Shape& input_shape, const Shape& output_shape)
{
    reduce_sum_all<float, 2>(input, output, input_shape, output_shape);
}
// Reduces a rank-2 float32 tensor along exactly one axis (given in
// `reduction_axes`), producing a rank-1 result in `output`.
// Thin dispatch shim so the emitted code can call a non-template symbol.
void reduce_sum_2d_1rd_float32(float* input,
                               float* output,
                               const Shape& input_shape,
                               const Shape& output_shape,
                               const AxisSet& reduction_axes)
{
    reduce_sum<float, 2, 1>(input, output, input_shape, output_shape, reduction_axes);
}
// Sums every element of a rank-4 float32 tensor into the scalar `output[0]`.
// Thin dispatch shim so the emitted code can call a non-template symbol.
void reduce_sum_all_4d_float32(
    float* input, float* output, const Shape& input_shape, const Shape& output_shape)
{
    reduce_sum_all<float, 4>(input, output, input_shape, output_shape);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType, unsigned int Rank>
void reduce_sum_all(ElementType* input,
ElementType* output,
const Shape& input_shape,
const Shape& output_shape)
{
Eigen::array<Eigen::Index, Rank> in_dims;
Eigen::array<Eigen::Index, 0> out_dims;
for (int i = 0; i < Rank; i++)
{
in_dims[i] = input_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, 0, Eigen::RowMajor>> out(output,
out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(input,
in_dims);
out.device(eigen::global_thread_pool_device) = in.sum();
}
template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
void reduce_sum(ElementType* input,
ElementType* output,
const Shape& input_shape,
const Shape& output_shape,
const AxisSet& reduction_axes)
{
Eigen::array<Eigen::Index, Rank> in_dims;
Eigen::array<Eigen::Index, Rank - ReductionDims> out_dims;
Eigen::array<Eigen::Index, ReductionDims> reduction_dims;
for (int i = 0; i < Rank; i++)
{
in_dims[i] = input_shape[i];
}
for (int i = 0; i < Rank - ReductionDims; i++)
{
out_dims[i] = output_shape[i];
}
int i = 0;
for (auto axis : reduction_axes)
{
reduction_dims[i++] = axis;
}
Eigen::TensorMap<
Eigen::Tensor<ElementType, Rank - ReductionDims, Eigen::RowMajor>>
out(output, out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(input,
in_dims);
out.device(eigen::global_thread_pool_device) = in.sum(reduction_dims);
}
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment