Unverified Commit 107091d0 authored by Jayaram Bobba's avatar Jayaram Bobba Committed by GitHub

Merge pull request #527 from NervanaSystems/pruthvi/mkldnn_elementwise_add

Elementwise Add mkldnn support in cpu emitter
parents ce3670b5 f84b5ed1
......@@ -150,12 +150,53 @@ namespace ngraph
<< args[1].get_name() << ");\n";
writer << "out = arg0 + arg1;\n";
#else
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
std::vector<float> scale_vector(2, 1);
std::vector<mkldnn::memory::primitive_desc> inputs_pd;
auto input0_format =
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
auto input1_format =
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1);
auto result_format =
runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0);
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input0_data_desc =
mkldnn_emitter->build_memory_descriptor(args[0], input0_format);
auto input1_data_desc =
mkldnn_emitter->build_memory_descriptor(args[1], input1_format);
auto result_desc =
mkldnn_emitter->build_memory_descriptor(out[0], result_format);
inputs_pd.push_back(mkldnn::memory::primitive_desc(
input0_data_desc, runtime::cpu::mkldnn_utils::global_cpu_engine));
inputs_pd.push_back(mkldnn::memory::primitive_desc(
input1_data_desc, runtime::cpu::mkldnn_utils::global_cpu_engine));
size_t add_index = 0;
add_index = mkldnn_emitter->build_elementwise_add(
input0_data_desc, input1_data_desc, result_desc, scale_vector, inputs_pd);
auto& deps = mkldnn_emitter->get_primitive_deps(add_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << args[1].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << out[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(add_index) << ");\n";
}
else
{
writer << "#pragma omp parallel for\n";
writer << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
writer << "{\n";
writer << " " << out[0].get_name() << "[i] = " << args[0].get_name() << "[i] + "
<< args[1].get_name() << "[i];\n";
writer << " " << out[0].get_name() << "[i] = " << args[0].get_name()
<< "[i] + " << args[1].get_name() << "[i];\n";
writer << "}\n";
}
#endif
writer.indent--;
writer << "}\n";
......
......@@ -136,3 +136,31 @@ size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& inpu
primitive_deps[conv_index] = {input_data_index, weights_index, result_index};
return conv_index;
}
size_t MKLDNNEmitter::build_elementwise_add(
    const mkldnn::memory::desc& input0_data_desc,
    const mkldnn::memory::desc& input1_data_desc,
    const mkldnn::memory::desc& result_desc,
    const std::vector<float>& scale_vector,
    const std::vector<mkldnn::memory::primitive_desc>& inputs_pd)
{
    // Register memory primitives for the two inputs and the output; the
    // returned indices identify them in the emitter's primitive table.
    size_t input0_index = build_memory_primitive(input0_data_desc);
    size_t input1_index = build_memory_primitive(input1_data_desc);
    size_t result_index = build_memory_primitive(result_desc);

    // Gather the input memory primitives in the order mkldnn::sum expects.
    std::vector<mkldnn::memory::primitive::at> sum_inputs;
    sum_inputs.push_back(*mkldnn_primitives[input0_index]);
    sum_inputs.push_back(*mkldnn_primitives[input1_index]);

    // Descriptor for the element-wise (scaled) sum primitive.
    mkldnn::sum::primitive_desc sum_pd{result_desc, scale_vector, inputs_pd};

    // Create the sum primitive and record its memory dependencies so the
    // generated code can bind runtime pointers before invoking it.
    size_t add_index = insert_primitive(
        new mkldnn::sum(sum_pd, sum_inputs, *mkldnn_primitives[result_index]));
    primitive_deps[add_index] = {input0_index, input1_index, result_index};
    return add_index;
}
......@@ -68,6 +68,13 @@ namespace ngraph
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above);
size_t build_elementwise_add(
const mkldnn::memory::desc& input0_data_desc,
const mkldnn::memory::desc& input1_data_desc,
const mkldnn::memory::desc& result_desc,
const std::vector<float>& scale_vector,
const std::vector<mkldnn::memory::primitive_desc>& input_pd);
private:
std::shared_ptr<CPU_ExternalFunction> external_function;
std::vector<mkldnn::primitive*> mkldnn_primitives;
......
......@@ -20,6 +20,7 @@
#include <unordered_set>
#include "ngraph/node.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/convolution.hpp"
......@@ -38,6 +39,7 @@ using namespace std;
#define TI(x) std::type_index(typeid(x))
static const std::unordered_set<std::type_index> s_op_registry{
TI(ngraph::op::Add),
TI(ngraph::op::AvgPool),
TI(ngraph::op::AvgPoolBackprop),
TI(ngraph::op::BatchNorm),
......
......@@ -25,6 +25,7 @@
#include <mkldnn.hpp>
#include "ngraph/descriptor/output.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/relu.hpp"
......@@ -42,6 +43,33 @@ namespace ngraph
{
namespace pass
{
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Add)
{
    auto add = static_cast<op::Add*>(node);
    auto arg0_shape = node->get_input_shape(0);
    auto arg1_shape = node->get_input_shape(1);
    auto arg0_rank = arg0_shape.size();
    auto arg1_rank = arg1_shape.size();

    // Total element count of input 0. BUGFIX: the original multiplied
    // arg0_shape[0] on every iteration (yielding dim0^rank instead of the
    // element count) and accumulated into an int; index each dimension and
    // use size_t to avoid overflow on large tensors.
    size_t src_size = 1;
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        src_size *= arg0_shape[i];
    }

    // Insert Add as an MKLDNN op only for large f32 4D tensors; for smaller
    // tensors the MKLDNN primitive setup overhead outweighs any benefit.
    if (node->get_input_element_type(0) == element::f32 &&
        node->get_input_element_type(1) == element::f32 && arg0_rank == 4 &&
        arg1_rank == 4 && src_size > 64000)
    {
        auto op_annotations =
            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
        op_annotations->set_mkldnn_op(true);
        add->set_op_annotations(op_annotations);
    }
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Convolution)
{
......@@ -204,6 +232,7 @@ namespace ngraph
#define TI(x) type_index(typeid(x))
static const runtime::cpu::pass::AssignOpMap s_dispatcher{
{TI(ngraph::op::Add), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Add>},
{TI(ngraph::op::Convolution),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropData),
......
......@@ -25,6 +25,7 @@
#include "cpu_layout.hpp"
#include "ngraph/descriptor/output.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/op.hpp"
......@@ -666,6 +667,29 @@ namespace ngraph
set_default_layouts(external_function, node);
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::Add)
{
    // When Add is not assigned to MKLDNN, fall back to default layouts.
    if (!runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node.get()))
    {
        set_default_layouts(external_function, node);
        return;
    }

    // MKLDNN elementwise add: propagate input 0's format to the second
    // input and to the output so all three share one layout.
    auto common_format =
        runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node.get(), 0);
    vector<memory::format> prim_input_formats(2, common_format);
    vector<memory::format> prim_output_formats(1, common_format);

    node = insert_input_conversions(external_function, node, prim_input_formats);
    set_output_layouts(node, prim_output_formats);
}
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment