Commit ec45be4b authored by Jaikrishnan Menon, committed by Scott Cyphers

DEX: Reduce function (#1349)

* CPU Direct Execution: Implement Reduce

* Workarounds for ancient CI compilers

* Fix return types

* Review comments
parent 21012673
--- a/src/ngraph/runtime/cpu/CMakeLists.txt
+++ b/src/ngraph/runtime/cpu/CMakeLists.txt
@@ -44,6 +44,7 @@ set(SRC
     builder/relu.cpp
     builder/pad.cpp
     builder/product.cpp
+    builder/reduce_function.cpp
     builder/reduce_function_window.cpp
     builder/reshape.cpp
     builder/reverse.cpp
--- /dev/null
+++ b/src/ngraph/runtime/cpu/builder/reduce_function.cpp
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/kernel/reduce_function.hpp"
#include "ngraph/op/reduce.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/tensor_view.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            template <>
            void Builder::BUILDER_DECL(ngraph::op::Reduce)
            {
                auto reduce = static_cast<const ngraph::op::Reduce*>(node);
                auto function = reduce->get_functions()[0];

                auto& functors = external_function->get_functors();
                auto& tensor_data = external_function->get_tensor_data();
                auto& callees = external_function->get_callees();

                // Compile the user-supplied reduction function once and cache it
                // by name so that every Reduce using the same function shares one
                // compiled callee.
                if (!callees.count(function->get_name()))
                {
                    callees[function->get_name()] = make_shared<CPU_ExternalFunction>(function);
                }
                auto& reducer_external_function = callees[function->get_name()];

                auto& arg0_tensor = tensor_data[args[0].get_name()];
                auto& arg1_tensor = tensor_data[args[1].get_name()];
                auto& out_tensor = tensor_data[out[0].get_name()];

                auto arg0_shape = args[0].get_shape();
                auto out_shape = out[0].get_shape();
                auto reduction_axes = reduce->get_reduction_axes();

                if (reduction_axes.empty())
                {
                    // Reducing over no axes is the identity: copy input to output.
                    size_t size = args[0].get_size() * args[0].get_element_type().size();
                    auto functor = [&, size](CPURuntimeContext* ctx) {
                        memcpy(out_tensor, arg0_tensor, size);
                    };
                    functors.emplace_back(functor);
                }
                else if (reduction_axes.size() == 1)
                {
                    // One reduction axis: dispatch on element type and input rank.
                    std::function<decltype(runtime::cpu::kernel::reduce_function_1rd<float, 1>)>
                        kernel;

                    SELECT_KERNEL_BY_RANK(kernel,
                                          args[0].get_element_type(),
                                          arg0_shape.size(),
                                          runtime::cpu::kernel::reduce_function_1rd);

                    auto functor =
                        [&, kernel, arg0_shape, out_shape, reduction_axes](CPURuntimeContext* ctx) {
                            kernel(arg0_tensor,
                                   arg1_tensor,
                                   out_tensor,
                                   arg0_shape,
                                   out_shape,
                                   reduction_axes,
                                   reducer_external_function);
                        };
                    functors.emplace_back(functor);
                }
                else if (arg0_shape.size() == 2 && reduction_axes.size() == 2)
                {
                    // 2D input fully reduced to a scalar.
                    std::function<decltype(runtime::cpu::kernel::reduce_function_2d_2rd<float>)>
                        kernel;

                    SELECT_KERNEL(kernel,
                                  args[0].get_element_type(),
                                  runtime::cpu::kernel::reduce_function_2d_2rd);

                    auto functor =
                        [&, kernel, arg0_shape, out_shape, reduction_axes](CPURuntimeContext* ctx) {
                            kernel(arg0_tensor,
                                   arg1_tensor,
                                   out_tensor,
                                   arg0_shape,
                                   out_shape,
                                   reduction_axes,
                                   reducer_external_function);
                        };
                    functors.emplace_back(functor);
                }
                else if (arg0_shape.size() == 3 && reduction_axes.size() == 2)
                {
                    // 3D input reduced along two axes to a vector.
                    std::function<decltype(runtime::cpu::kernel::reduce_function_3d_2rd<float>)>
                        kernel;

                    SELECT_KERNEL(kernel,
                                  args[0].get_element_type(),
                                  runtime::cpu::kernel::reduce_function_3d_2rd);

                    auto functor =
                        [&, kernel, arg0_shape, out_shape, reduction_axes](CPURuntimeContext* ctx) {
                            kernel(arg0_tensor,
                                   arg1_tensor,
                                   out_tensor,
                                   arg0_shape,
                                   out_shape,
                                   reduction_axes,
                                   reducer_external_function);
                        };
                    functors.emplace_back(functor);
                }
                else
                {
                    throw ngraph_error("Unsupported Reduce");
                }
            }

            REGISTER_OP_BUILDER(Reduce);
        }
    }
}
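For context, here is a hedged sketch of the kind of graph this builder consumes, following the historical op::Reduce signature of this nGraph era (reductee, initial value, reduction function, reduction axes). Names and exact API details are illustrative and may differ slightly:

// Illustrative only: a sum-reduction over axis 0 expressed as op::Reduce.
// f(x, y) = x + y is the user-supplied reduction function that the builder
// above compiles into a callee CPU_ExternalFunction.
auto A = make_shared<op::Parameter>(element::f32, Shape{2, 3});
auto init = make_shared<op::Parameter>(element::f32, Shape{}); // initial value

auto f_x = make_shared<op::Parameter>(element::f32, Shape{});
auto f_y = make_shared<op::Parameter>(element::f32, Shape{});
auto f = make_shared<Function>(make_shared<op::Add>(f_x, f_y),
                               op::ParameterVector{f_x, f_y});

// Result shape is {3}: axis 0 is folded away, so the builder takes the
// single-reduction-axis branch and dispatches reduce_function_1rd.
auto sum = make_shared<op::Reduce>(A, init, f, AxisSet{0});
auto graph = make_shared<Function>(sum, op::ParameterVector{A, init});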
--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -1189,7 +1189,8 @@ void runtime::cpu::CPU_ExternalFunction::build()
         auto handler = build_dispatcher.find(type_index(typeid(n)));
         if (handler == build_dispatcher.end())
         {
-            throw ngraph_error("Unhandled op during code generation : " + node->description());
+            throw ngraph_error("Unhandled op during executor construction : " +
+                               node->description());
         }
         vector<TensorViewWrapper> in;
         vector<string> in_names;
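The error path above fires when no builder was registered for a node's concrete type. A minimal sketch of the type_index-keyed dispatch that REGISTER_OP_BUILDER feeds, simplified from the actual nGraph code (names like BuildFn and build_node are illustrative):

#include <functional>
#include <stdexcept>
#include <string>
#include <typeindex>
#include <unordered_map>

struct Node { virtual ~Node() = default; };
struct Reduce : Node {};

using BuildFn = std::function<void(Node&)>;
static std::unordered_map<std::type_index, BuildFn> build_dispatcher;

// REGISTER_OP_BUILDER(Reduce) conceptually performs an insertion like:
//   build_dispatcher[std::type_index(typeid(Reduce))] = &build_reduce;

void build_node(Node& n)
{
    // typeid on a polymorphic reference yields the dynamic type, so the
    // lookup matches the node's concrete op class.
    auto handler = build_dispatcher.find(std::type_index(typeid(n)));
    if (handler == build_dispatcher.end())
    {
        throw std::runtime_error("Unhandled op during executor construction: " +
                                 std::string(typeid(n).name()));
    }
    handler->second(n); // append the op's functor(s) to the execution list
}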
--- a/src/ngraph/runtime/cpu/cpu_tensor_view.cpp
+++ b/src/ngraph/runtime/cpu/cpu_tensor_view.cpp
@@ -32,8 +32,6 @@ using namespace std;
 // TODO(jmenon): Refactor all the alignment specifications into
 // a single place and allow lower or no alignment when possible
-const size_t runtime::cpu::CPUTensorView::BufferAlignment = 64;
 runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
                                            const Shape& shape,
                                            void* memory_pointer,
--- a/src/ngraph/runtime/cpu/cpu_tensor_view.hpp
+++ b/src/ngraph/runtime/cpu/cpu_tensor_view.hpp
@@ -57,9 +57,9 @@ namespace ngraph
             /// @param n Number of bytes to read, must be an integral number of elements.
             void read(void* p, size_t tensor_offset, size_t n) const override;
         private:
-            static const size_t BufferAlignment;
+            static constexpr size_t BufferAlignment = 64;
         private:
             char* buffer;
             char* aligned_buffer;
             size_t buffer_size;
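Why the switch from an out-of-line static const to an in-class static constexpr: the new kernel header uses BufferAlignment as the argument of an alignment attribute, which needs a constant expression visible in every including translation unit, while an out-of-line const definition only guarantees a value at link time. A minimal illustration, not from the commit (type names are invented):

#include <cstddef>

struct OldStyle
{
    static const size_t BufferAlignment; // value defined in one .cpp file
};

struct NewStyle
{
    static constexpr size_t BufferAlignment = 64; // value visible in-class
};

// OK everywhere: the initializer is part of the header.
char scratch __attribute__((aligned(NewStyle::BufferAlignment)));

// Rejected by many compilers: OldStyle::BufferAlignment's value is not
// known in this translation unit.
// char scratch2 __attribute__((aligned(OldStyle::BufferAlignment)));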
--- /dev/null
+++ b/src/ngraph/runtime/cpu/kernel/reduce_function.hpp
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/axis_set.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/type/element_type.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernel
            {
                // Adapter that lets Eigen's reduction machinery call back into a
                // compiled nGraph function. Eigen drives the reduction; for every
                // pair of elements it invokes reduce(), which stages the two
                // scalars as rank-0 tensors and executes the callee through a
                // call frame. Note this costs one full call-frame invocation per
                // reduced element.
                template <typename ElementType>
                struct Reducer
                {
                    static const bool PacketAccess = false; // no vectorized path
                    static const bool IsStateful = false;

                    ElementType initial;
                    const std::shared_ptr<CPU_ExternalFunction>& external_function;
                    std::shared_ptr<Backend> backend;

                    Reducer(ElementType x, const std::shared_ptr<CPU_ExternalFunction>& ef)
                        : initial(x)
                        , external_function(ef)
                        , backend(runtime::Backend::create("CPU"))
                    {
                    }

                    void reduce(const ElementType v, ElementType* R)
                    {
                        TensorViewPtrs inputs, outputs;

                        // Scalar staging buffers, aligned so they can back
                        // zero-copy CPU tensor views.
                        ElementType p __attribute__((aligned(CPUTensorView::BufferAlignment))) = v;
                        ElementType q __attribute__((aligned(CPUTensorView::BufferAlignment))) = *R;
                        ElementType r __attribute__((aligned(CPUTensorView::BufferAlignment)));

                        inputs.emplace_back(backend->create_tensor(
                            ngraph::element::from<ElementType>(), Shape{}, &p));
                        inputs.emplace_back(backend->create_tensor(
                            ngraph::element::from<ElementType>(), Shape{}, &q));
                        outputs.emplace_back(backend->create_tensor(
                            ngraph::element::from<ElementType>(), Shape{}, &r));

                        auto call_frame = external_function->make_call_frame();
                        call_frame->call(outputs, inputs);
                        *R = r;
                    }

                    ElementType initialize() const { return initial; }
                    ElementType finalize(const ElementType R) const { return R; }
                };
                template <typename ElementType, unsigned int Rank, unsigned int ReductionDims>
                void reduce_function(void* input0,
                                     void* input1,
                                     void* output,
                                     const Shape& input_shape,
                                     const Shape& output_shape,
                                     const AxisSet& reduction_axes,
                                     const std::shared_ptr<CPU_ExternalFunction>& external_function)
                {
                    Eigen::array<Eigen::Index, Rank> in_dims;
                    Eigen::array<Eigen::Index, Rank - ReductionDims> out_dims;
                    Eigen::array<Eigen::Index, ReductionDims> reduction_dims;

                    for (int i = 0; i < Rank; i++)
                    {
                        in_dims[i] = input_shape[i];
                    }
                    for (int i = 0; i < Rank - ReductionDims; i++)
                    {
                        out_dims[i] = output_shape[i];
                    }

                    // e.g. input_shape {2, 3, 4} with reduction_axes {0, 2}
                    // yields reduction_dims {0, 2} and out_dims {3}.
                    int i = 0;
                    for (auto axis : reduction_axes)
                    {
                        reduction_dims[i++] = axis;
                    }

                    Eigen::TensorMap<
                        Eigen::Tensor<ElementType, Rank - ReductionDims, Eigen::RowMajor>>
                        out(static_cast<ElementType*>(output), out_dims);
                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
                        static_cast<ElementType*>(input0), in_dims);

                    // input1 holds the scalar initial value for the reduction.
                    Reducer<ElementType> reducer(*static_cast<ElementType*>(input1),
                                                 external_function);

                    out.device(eigen::global_thread_pool_device) =
                        in.reduce(reduction_dims, reducer);
                }
                // Rank-specialized entry points matched by the SELECT_KERNEL and
                // SELECT_KERNEL_BY_RANK dispatch in the builder.
                template <typename ElementType, unsigned int Rank>
                void reduce_function_1rd(
                    void* input0,
                    void* input1,
                    void* output,
                    const Shape& input_shape,
                    const Shape& output_shape,
                    const AxisSet& reduction_axes,
                    const std::shared_ptr<CPU_ExternalFunction>& external_function)
                {
                    reduce_function<ElementType, Rank, 1>(input0,
                                                          input1,
                                                          output,
                                                          input_shape,
                                                          output_shape,
                                                          reduction_axes,
                                                          external_function);
                }

                template <typename ElementType>
                void reduce_function_2d_2rd(
                    void* input0,
                    void* input1,
                    void* output,
                    const Shape& input_shape,
                    const Shape& output_shape,
                    const AxisSet& reduction_axes,
                    const std::shared_ptr<CPU_ExternalFunction>& external_function)
                {
                    reduce_function<ElementType, 2, 2>(input0,
                                                       input1,
                                                       output,
                                                       input_shape,
                                                       output_shape,
                                                       reduction_axes,
                                                       external_function);
                }

                template <typename ElementType>
                void reduce_function_3d_2rd(
                    void* input0,
                    void* input1,
                    void* output,
                    const Shape& input_shape,
                    const Shape& output_shape,
                    const AxisSet& reduction_axes,
                    const std::shared_ptr<CPU_ExternalFunction>& external_function)
                {
                    reduce_function<ElementType, 3, 2>(input0,
                                                       input1,
                                                       output,
                                                       input_shape,
                                                       output_shape,
                                                       reduction_axes,
                                                       external_function);
                }
            }
        }
    }
}
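To make the Reducer contract concrete: Eigen's reduction framework expects a reducer exposing initialize(), reduce(), and finalize() plus the PacketAccess and IsStateful traits, exactly the surface the struct above implements. Below is a self-contained sketch with a simple max reducer standing in for the nGraph callee (illustrative only; MaxReducer is invented for this example):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <algorithm>
#include <iostream>
#include <limits>

template <typename T>
struct MaxReducer
{
    static const bool PacketAccess = false; // no vectorized reduce() provided
    static const bool IsStateful = false;   // Eigen may copy this freely

    T initialize() const { return std::numeric_limits<T>::lowest(); }
    void reduce(const T v, T* accum) const { *accum = std::max(*accum, v); }
    T finalize(const T accum) const { return accum; }
};

int main()
{
    Eigen::Tensor<float, 2, Eigen::RowMajor> in(2, 3);
    in.setValues({{1, 5, 2}, {4, 0, 3}});

    // Reduce away axis 0, keeping the 3 columns: the same dims mapping
    // reduce_function performs for a rank-2 input with one reduction axis.
    Eigen::array<Eigen::Index, 1> axes{{0}};
    Eigen::Tensor<float, 1, Eigen::RowMajor> out = in.reduce(axes, MaxReducer<float>());

    std::cout << out << std::endl; // prints: 4 5 3
}

The commit's Reducer follows the same protocol; the difference is that its reduce() runs a compiled nGraph function per element pair instead of an inline std::max.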