Commit 7d59542d authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

Jbobba/dex computation reuse (#1219)

* CPU Direct Execution: Implement ConvertLayout and refactor

* CPU Direct Execution: Implement Convolution

* 1) Adds computation reuse to direct execution
2) Add avg_pool, broadcast and convolution_bias to direct execution
3) Moved some computation reuse utility functions to graph_utils

* Use lists instead of vectors to avoid reallocation overheads

* - Style fix

* style fix
parent 260cb90d
......@@ -503,3 +503,49 @@ size_t ngraph::get_user_count(Node* node)
}
return count;
}
// Returns true when `node` produces a function output tensor: either the node
// is itself flagged as an output, or one of its outputs feeds an op::Result
// whose copy has been eliminated (so this node writes the result buffer
// directly).
bool ngraph::computes_result(Node* node)
{
    if (node->is_output())
    {
        return true;
    }

    // Check if node feeds a result node that has been copy eliminated.
    for (const descriptor::Output& node_output : node->get_outputs())
    {
        for (const descriptor::Input* user_input : node_output.get_inputs())
        {
            auto result_op =
                std::dynamic_pointer_cast<ngraph::op::Result>(user_input->get_node());
            if (result_op && !result_op->needs_copy())
            {
                return true;
            }
        }
    }
    return false;
}
// Returns true if any consumer of `node`'s outputs carries an in-place
// output/input annotation whose input index matches the edge coming from this
// node — i.e. a downstream in-place kernel may overwrite this node's output
// tensor.
bool ngraph::possibly_overwritten(Node* node)
{
    for (const descriptor::Output& node_output : node->get_outputs())
    {
        for (const descriptor::Input* user_input : node_output.get_inputs())
        {
            auto user_op = std::dynamic_pointer_cast<ngraph::op::Op>(user_input->get_node());
            if (!user_op)
            {
                continue;
            }
            auto annotations = user_op->get_op_annotations();
            if (!annotations)
            {
                continue;
            }
            for (const auto& oi_pair : annotations->get_in_place_oi_pairs())
            {
                if (user_input->get_index() == oi_pair.second)
                {
                    return true;
                }
            }
        }
    }
    return false;
}
......@@ -141,4 +141,11 @@ namespace ngraph
// Returns count of `node` users that are still live in the graph
size_t get_user_count(Node* node);
// Returns true if `node` computes an output tensor
bool computes_result(Node* node);
// Returns true if a node's user could potentially overwrite
// the output of this node with in-place kernels
bool possibly_overwritten(Node* node);
}
......@@ -27,6 +27,7 @@ set(SRC
cpu_tensor_view_wrapper.cpp
cpu_tensor_view.cpp
cpu_tracing.cpp
builder/avg_pool.cpp
builder/convert_layout.cpp
builder/convolution.cpp
kernel/eigen_thread_pool.cpp
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/kernel/avg_pool.hpp"
#include "ngraph/op/avg_pool.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
// Direct-execution builder for ngraph::op::AvgPool: appends one functor to the
// external function's functor list, using either a prebuilt MKLDNN pooling
// primitive or the typed reference kernel.
template <>
void Builder::BUILDER_DECL(ngraph::op::AvgPool)
{
auto avg_pool = static_cast<const ngraph::op::AvgPool*>(node);
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
// Shapes are copied by value; tensor slots are taken by reference so the
// functor observes whatever pointer the executor installs at run time.
auto arg0_shape = args[0].get_shape();
auto out_shape = out[0].get_shape();
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
// Pooling attributes taken from the op node at build time.
auto window_shape = avg_pool->get_window_shape();
auto window_movement_strides = avg_pool->get_window_movement_strides();
auto padding_below = avg_pool->get_padding_below();
auto padding_above = avg_pool->get_padding_above();
auto include_padding_in_avg_computation =
avg_pool->get_include_padding_in_avg_computation();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
// MKLDNN path: build the pooling primitive once now; the functor only
// rebinds the memory pointers and invokes the cached primitive.
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_emitter->build_memory_descriptor(
args[0], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0));
auto result_desc = mkldnn_emitter->build_memory_descriptor(
out[0], runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0));
size_t avg_pool_index = mkldnn_emitter->build_pooling_forward(
(include_padding_in_avg_computation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
input_desc,
result_desc,
window_movement_strides,
window_shape,
padding_below,
padding_above);
// deps[0]/deps[1] are the primitive's input/output memory handles.
auto& deps = mkldnn_emitter->get_primitive_deps(avg_pool_index);
// `deps` and the tensor slots live in the external function, which also
// owns the functor list, so by-reference capture is safe here; the
// primitive index is captured by value.
auto functor = [&, avg_pool_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, avg_pool_index);
};
functors.emplace_back(functor);
}
else
{
// Reference path: pick the kernel instantiation matching the output
// element type, then capture all pooling attributes by value.
std::function<decltype(runtime::cpu::kernel::avg_pool<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::avg_pool);
auto functor = [&,
kernel,
arg0_shape,
out_shape,
window_shape,
window_movement_strides,
padding_below,
padding_above,
include_padding_in_avg_computation](CPURuntimeContext* ctx) {
kernel(arg0_tensor,
out_tensor,
arg0_shape,
out_shape,
window_shape,
window_movement_strides,
padding_below,
padding_above,
include_padding_in_avg_computation);
};
functors.emplace_back(functor);
}
}
}
}
}
......@@ -19,6 +19,7 @@
#include "ngraph/runtime/cpu/kernel/convolution.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
using namespace std;
using namespace ngraph;
......@@ -141,6 +142,83 @@ namespace ngraph
}
}
// Direct-execution builder for ngraph::op::ConvolutionBias: builds the fused
// convolution+bias MKLDNN primitive once and queues a functor that rebinds
// memory pointers and invokes it. ConvolutionBias has no reference fallback,
// so a non-MKLDNN placement is a build-time error.
// Fixes vs. previous revision: removed unused locals (arg0_shape, arg1_shape,
// result_shape) and the dead `conv_index = 0` initialization.
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBias)
{
    auto convolution = static_cast<const ngraph::op::ConvolutionBias*>(node);
    auto& functors = external_function->get_functors();
    auto& tensor_data = external_function->get_tensor_data();

    // Tensor slots are captured by reference below so the functor reads the
    // pointers the executor installs at run time.
    auto& arg0_tensor = tensor_data[args[0].get_name()];
    auto& arg1_tensor = tensor_data[args[1].get_name()];
    auto& arg2_tensor = tensor_data[args[2].get_name()];
    auto& out_tensor = tensor_data[out[0].get_name()];

    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        // For dilation, MKLDNN wants to know how many elements to insert between, not how far
        // apart to space the elements like nGraph. So we have to subtract 1 from each pos.
        Strides window_dilation_strides_adjusted;
        for (size_t s : convolution->get_window_dilation_strides())
        {
            window_dilation_strides_adjusted.push_back(s - 1);
        }

        auto input_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
        auto weights_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1);
        // HACK to help MKLDNN pick the right implementation
        if (weights_format == mkldnn::memory::format::nchw)
        {
            weights_format = mkldnn::memory::format::oihw;
        }
        auto bias_format = mkldnn_utils::get_input_mkldnn_format(node, 2);
        auto output_format = runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0);

        auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
        auto input_data_desc = mkldnn_emitter->build_memory_descriptor(args[0], input_format);
        auto weights_desc = mkldnn_emitter->build_memory_descriptor(args[1], weights_format);
        auto bias_desc = mkldnn_emitter->build_memory_descriptor(args[2], bias_format);
        auto result_desc = mkldnn_emitter->build_memory_descriptor(out[0], output_format);

        size_t conv_index = mkldnn_emitter->build_convolution_forward(
            input_data_desc,
            weights_desc,
            bias_desc,
            result_desc,
            convolution->get_window_movement_strides(),
            window_dilation_strides_adjusted,
            convolution->get_padding_below(),
            convolution->get_padding_above());

        // deps = {input, weights, bias, output} memory handles of the primitive.
        auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
        auto functor = [&, conv_index](CPURuntimeContext* ctx) {
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[3], out_tensor);
            cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
        };
        functors.emplace_back(functor);
    }
    else
    {
        throw ngraph_error("ConvolutionBias is only supported with MKLDNN kernel.");
    }
}
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBackpropData)
{
......
......@@ -94,6 +94,7 @@
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/kernel/abs.hpp"
#include "ngraph/runtime/cpu/kernel/add.hpp"
#include "ngraph/runtime/cpu/kernel/broadcast.hpp"
#include "ngraph/runtime/cpu/kernel/ceil.hpp"
#include "ngraph/runtime/cpu/kernel/multiply.hpp"
#include "ngraph/runtime/cpu/kernel/relu.hpp"
......@@ -175,6 +176,33 @@ namespace ngraph
BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::abs);
}
// Direct-execution builder for ngraph::op::Broadcast: selects the reference
// broadcast kernel instantiation for the output element type and queues a
// functor that invokes it.
template <>
void Builder::BUILDER_DECL(ngraph::op::Broadcast)
{
    auto broadcast = static_cast<const ngraph::op::Broadcast*>(node);
    auto& functors = external_function->get_functors();
    auto& tensor_data = external_function->get_tensor_data();

    // Shapes and axes are captured by value below; tensor slots by reference
    // so the functor sees the pointers the executor installs at run time.
    auto in_shape = args[0].get_shape();
    auto out_shape = out[0].get_shape();
    auto axes = broadcast->get_broadcast_axes();
    auto& in_tensor = tensor_data[args[0].get_name()];
    auto& out_tensor = tensor_data[out[0].get_name()];

    // Pick the kernel instantiation matching the output element type.
    std::function<void(void*, void*, const Shape&, const Shape&, const AxisSet&)>
        kernel;
    SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::broadcast);

    functors.emplace_back(
        [&, kernel, in_shape, out_shape, axes](CPURuntimeContext* ctx) {
            kernel(in_tensor, out_tensor, in_shape, out_shape, axes);
        });
}
template <>
void Builder::BUILDER_DECL(ngraph::op::Ceiling)
{
......@@ -370,11 +398,15 @@ namespace ngraph
{TI(ngraph::op::Multiply), &runtime::cpu::Builder::build<ngraph::op::Multiply>},
{TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
{TI(ngraph::op::Abs), &runtime::cpu::Builder::build<ngraph::op::Abs>},
{TI(ngraph::op::AvgPool), &runtime::cpu::Builder::build<ngraph::op::AvgPool>},
{TI(ngraph::op::Broadcast), &runtime::cpu::Builder::build<ngraph::op::Broadcast>},
{TI(ngraph::op::Ceiling), &runtime::cpu::Builder::build<ngraph::op::Ceiling>},
{TI(ngraph::runtime::cpu::op::ConvertLayout),
&runtime::cpu::Builder::build<ngraph::runtime::cpu::op::ConvertLayout>},
{TI(ngraph::op::Convolution),
&runtime::cpu::Builder::build<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBias),
&runtime::cpu::Builder::build<ngraph::op::ConvolutionBias>},
{TI(ngraph::op::ConvolutionBackpropData),
&runtime::cpu::Builder::build<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBackpropFilters),
......
......@@ -810,53 +810,9 @@ using namespace ngraph::runtime;
}
}
auto computes_output = [&]() {
if (node->is_output())
{
return true;
}
// Check if node feeds a result node that has been copy eliminated
for (const descriptor::Output& output : node->get_outputs())
{
for (const descriptor::Input* input : output.get_inputs())
{
auto res =
std::dynamic_pointer_cast<ngraph::op::Result>(input->get_node());
if (res && !res->needs_copy())
{
return true;
}
}
}
return false;
};
auto possibly_overwritten = [&]() {
for (const descriptor::Output& output : node->get_outputs())
{
for (const descriptor::Input* input : output.get_inputs())
{
if (auto op =
std::dynamic_pointer_cast<ngraph::op::Op>(input->get_node()))
{
if (auto op_annotations = op->get_op_annotations())
{
for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
{
if (input->get_index() == oi_pair.second)
{
return true;
}
}
}
}
}
}
return false;
};
// Always enable nodes computing output tensors or nodes whose outputs might get
// overwritten due to inplace kernels
if (computes_output() || possibly_overwritten())
if (computes_result(node.get()) || possibly_overwritten(node.get()))
{
writer << " || 1";
}
......@@ -1182,6 +1138,10 @@ void runtime::cpu::CPU_ExternalFunction::build()
for (shared_ptr<Node> node : m_function->get_ordered_ops())
{
if (node->is_parameter() || node->is_constant())
{
continue;
}
auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = build_dispatcher.find(type_index(typeid(n)));
......@@ -1190,23 +1150,48 @@ void runtime::cpu::CPU_ExternalFunction::build()
throw ngraph_error("Unhandled op during code generation : " + node->description());
}
vector<TensorViewWrapper> in;
vector<string> in_names;
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(TensorViewWrapper(tv, tv->get_tensor().get_name()));
in_names.push_back(tv->get_tensor().get_name());
}
vector<TensorViewWrapper> out;
vector<string> out_names;
for (const descriptor::Output& output : node->get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(TensorViewWrapper(tv, tv->get_tensor().get_name()));
out_names.push_back(tv->get_tensor().get_name());
}
size_t functor_count = functors.size();
handler->second(this, node.get(), in, out);
bool disable_caching = computes_result(node.get()) || possibly_overwritten(node.get());
auto enable = [&, in_names, out_names, disable_caching](CPURuntimeContext* ctx) -> bool {
bool en = false;
for (const auto& name : in_names)
{
if (tensor_stale[name] || disable_caching)
{
en = true;
}
}
for (const auto& name : out_names)
{
tensor_stale[name] = en;
}
return en;
};
enables.emplace_back(make_pair(enable, functors.size() - functor_count));
}
executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) {
static bool first_iteration = true;
for (auto& p : intermediates_offsets)
{
tensor_data[p.first] =
......@@ -1216,6 +1201,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
for (const auto& p : function_input_index)
{
tensor_data[p.first] = inputs[p.second];
tensor_stale[p.first] = ctx->p_en[p.second];
}
for (const auto& p : function_output_index)
......@@ -1223,10 +1209,23 @@ void runtime::cpu::CPU_ExternalFunction::build()
tensor_data[p.first] = outputs[p.second];
}
for (const auto& functor : functors)
auto functor = functors.begin();
for (const auto& p : enables)
{
functor(ctx);
if (p.first(ctx) || first_iteration)
{
for (size_t j = 0; j < p.second; j++)
{
(*functor)(ctx);
std::advance(functor, 1);
}
}
else
{
std::advance(functor, p.second);
}
}
first_iteration = false;
};
m_is_built = true;
......
......@@ -158,9 +158,11 @@ namespace ngraph
std::string m_function_name;
std::list<std::function<void(CPURuntimeContext*)>> functors;
std::list<std::pair<std::function<bool(CPURuntimeContext*)>, size_t>> enables;
std::function<void(CPURuntimeContext*, std::vector<void*>&, std::vector<void*>&)>
executor;
std::unordered_map<std::string, void*> tensor_data;
std::unordered_map<std::string, bool> tensor_stale;
std::unordered_map<std::string, size_t> intermediates_offsets;
std::unordered_map<std::string, size_t> function_input_index, function_output_index;
bool m_is_built;
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/runtime/reference/avg_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
// Thin type-erasure adapter: casts the untyped tensor pointers to the
// concrete element type and forwards all pooling attributes unchanged to the
// reference average-pooling implementation.
template <typename ElementType>
void avg_pool(void* arg,
              void* out,
              const Shape& arg_shape,
              const Shape& out_shape,
              const Shape& window_shape,
              const Strides& window_movement_strides,
              const Shape& padding_below,
              const Shape& padding_above,
              bool include_padding_in_avg_computation)
{
    auto in_ptr = static_cast<const ElementType*>(arg);
    auto out_ptr = static_cast<ElementType*>(out);
    reference::avg_pool<ElementType>(in_ptr,
                                     out_ptr,
                                     arg_shape,
                                     out_shape,
                                     window_shape,
                                     window_movement_strides,
                                     padding_below,
                                     padding_above,
                                     include_padding_in_avg_computation);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
#include "ngraph/runtime/reference/broadcast.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
// Thin type-erasure adapter: casts the untyped tensor pointers to the
// concrete element type and forwards to the reference broadcast
// implementation.
template <typename ElementType>
void broadcast(void* input0,
               void* output,
               const Shape& arg0_shape,
               const Shape& result_shape,
               const AxisSet& broadcast_axes)
{
    auto src = static_cast<const ElementType*>(input0);
    auto dst = static_cast<ElementType*>(output);
    reference::broadcast<ElementType>(
        src, dst, arg0_shape, result_shape, broadcast_axes);
}
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment