Commit f9191dd9 authored by pthoreho's avatar pthoreho

- Added mkldnn add primitive with layout assignment support

- modified mkldnn add kernel to use mkldnn_utils to build primitive and stream execution
parent 7f01651c
......@@ -150,65 +150,44 @@ namespace ngraph
<< args[1].get_name() << ");\n";
writer << "out = arg0 + arg1;\n";
#else
auto op_annotations =
static_cast<const ngraph::op::Op*>(node)->get_op_annotations();
if (op_annotations &&
static_pointer_cast<ngraph::runtime::cpu::CPUOpAnnotations>(op_annotations)
->is_mkldnn_op())
{
auto input0_size_1d = 1;
auto input1_size_1d = 1;
auto result_size_1d = 1;
auto src_size = args[0].get_shape().size();
for (size_t i = 0; i < src_size; i++)
{
input0_size_1d *= args[0].get_shape()[i];
input1_size_1d *= args[1].get_shape()[i];
result_size_1d *= out[0].get_shape()[i];
}
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
// get input element type
const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
args[1].get_element_type());
// Bind to CPU engine
writer << "engine cpu_engine = engine(engine::cpu, 0);\n";
writer << "std::vector<float>scale_vector(2, 1);\n";
writer << "std::vector<memory::primitive_desc> inputs_pd;\n";
writer << "std::vector<memory::primitive::at> inputs_primitive;\n";
// memory desc for inputs
writer << "memory::desc input0_data_desc = memory::desc({" << input0_size_1d
<< "}, " << et << ", memory::format::x);\n";
writer << "memory::desc input1_data_desc = memory::desc({" << input1_size_1d
<< "}, " << et << ", memory::format::x);\n";
writer << "memory::desc result_desc = memory::desc({" << result_size_1d << "}, "
<< et << ", memory::format::x);\n";
// memory for the user data
writer << "memory input0_data = memory({input0_data_desc, cpu_engine}, "
<< args[0].get_name() << ");\n";
writer << "memory input1_data = memory({input1_data_desc, cpu_engine}, "
<< args[1].get_name() << ");\n";
writer << "memory result = memory({result_desc, cpu_engine}, "
<< out[0].get_name() << ");\n";
writer << "inputs_pd.push_back(memory::primitive_desc(input0_data_desc, "
"cpu_engine));\n";
writer << "inputs_pd.push_back(memory::primitive_desc(input1_data_desc, "
"cpu_engine));\n";
std::vector<float>scale_vector(2, 1);
std::vector<mkldnn::memory::primitive_desc> inputs_pd;
writer << "inputs_primitive.push_back(primitive::at(input0_data));\n";
writer << "inputs_primitive.push_back(primitive::at(input1_data));\n";
auto input0_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
auto input1_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1);
auto result_format = runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0);
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input0_data_desc = mkldnn_emitter->build_memory_descriptor(args[0], input0_format);
auto input1_data_desc = mkldnn_emitter->build_memory_descriptor(args[1], input1_format);
auto result_desc = mkldnn_emitter->build_memory_descriptor(out[0], result_format);
inputs_pd.push_back(mkldnn::memory::primitive_desc(input0_data_desc,
runtime::cpu::mkldnn_utils::global_cpu_engine));
inputs_pd.push_back(mkldnn::memory::primitive_desc(input1_data_desc,
runtime::cpu::mkldnn_utils::global_cpu_engine));
size_t add_index=0;
add_index = mkldnn_emitter->build_elementwise_add(input0_data_desc,
input1_data_desc,
result_desc,
scale_vector,
inputs_pd);
auto& deps = mkldnn_emitter->get_primitive_deps(add_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << args[1].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << out[0].get_name() << ");\n";
// elementwise sum primitive descriptor
writer << "sum::primitive_desc sum_pd = sum::primitive_desc(result_desc, "
"scale_vector, inputs_pd);\n";
// sum primitive
writer << "sum sum_primitive = sum(sum_pd, inputs_primitive, result);\n";
// create stream and execute
writer << "stream s = stream(stream::kind::eager);\n"
<< "s.submit({sum_primitive}).wait();\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(add_index) << ");\n";
}
else
{
......
......@@ -140,19 +140,23 @@ size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& inpu
// Builds an MKLDNN elementwise-add (sum) primitive:
//   out = scale_vector[0] * input0 + scale_vector[1] * input1
// Returns the index of the sum primitive; its memory dependencies are
// recorded in primitive_deps[index] as {input0, input1, result} so callers
// can bind tensor pointers before invoking the primitive.
size_t MKLDNNEmitter::build_elementwise_add(
    const mkldnn::memory::desc& input0_data_desc,
    const mkldnn::memory::desc& input1_data_desc,
    const mkldnn::memory::desc& result_desc,
    const std::vector<float>& scale_vector,
    const std::vector<mkldnn::memory::primitive_desc>& inputs_pd)
{
    // Memory primitives for both inputs and the result.
    size_t input0_data_index = build_memory_primitive(input0_data_desc);
    size_t input1_data_index = build_memory_primitive(input1_data_desc);
    size_t result_index = build_memory_primitive(result_desc);

    std::vector<mkldnn::memory::primitive::at> inputs_primitive;
    inputs_primitive.push_back(*mkldnn_primitives[input0_data_index]);
    inputs_primitive.push_back(*mkldnn_primitives[input1_data_index]);

    // elementwise sum primitive descriptor
    mkldnn::sum::primitive_desc sum_pd =
        mkldnn::sum::primitive_desc(result_desc, scale_vector, inputs_pd);
    // sum primitive
    size_t add_index = insert_primitive(
        new mkldnn::sum(sum_pd, inputs_primitive, *mkldnn_primitives[result_index]));

    // BUG FIX: deps were recorded as {input1, input0, result}, but the emitter
    // binds deps[0] to args[0] and deps[1] to args[1]; with differing input
    // layouts the swapped order would attach each tensor to the wrong memory
    // primitive. Record in the order callers expect.
    primitive_deps[add_index] = {input0_data_index, input1_data_index, result_index};
    return add_index;
}
......
......@@ -68,6 +68,12 @@ namespace ngraph
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above);
size_t build_elementwise_add(const mkldnn::memory::desc& input0_data_desc,
const mkldnn::memory::desc& input1_data_desc,
const mkldnn::memory::desc& result_desc,
const std::vector<float>& scale_vector,
const std::vector<mkldnn::memory::primitive_desc>& input_pd);
private:
std::shared_ptr<CPU_ExternalFunction> external_function;
std::vector<mkldnn::primitive*> mkldnn_primitives;
......
......@@ -26,11 +26,8 @@
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/types/element_type.hpp"
#include "mkldnn_utils.hpp"
......
......@@ -43,6 +43,30 @@ namespace ngraph
{
namespace pass
{
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Add)
{
    auto add = static_cast<op::Add*>(node);
    auto arg0_shape = node->get_input_shape(0);
    // Total element count of input 0.
    // BUG FIX: the original multiplied arg0_shape[0] on every iteration
    // instead of arg0_shape[i], yielding shape[0]^rank rather than the true
    // element count, so the MKLDNN size threshold below tested the wrong value.
    size_t src_size = 1;
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        src_size *= arg0_shape[i];
    }
    // Assign Add to MKLDNN only for large f32 tensors: for small tensors the
    // MKLDNN primitive setup overhead outweighs any kernel speedup.
    if (node->get_input_element_type(0) == element::f32 &&
        node->get_input_element_type(1) == element::f32 && src_size > 64000)
    {
        auto op_annotations =
            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
        op_annotations->set_mkldnn_op(true);
        add->set_op_annotations(op_annotations);
    }
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Convolution)
{
......@@ -71,28 +95,6 @@ namespace ngraph
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Add)
{
    auto add = static_cast<op::Add*>(node);
    auto arg0_shape = node->get_input_shape(0);
    // Total element count of input 0.
    // BUG FIX: the original multiplied arg0_shape[0] on every iteration
    // instead of arg0_shape[i], yielding shape[0]^rank rather than the true
    // element count, so the MKLDNN size threshold below tested the wrong value.
    size_t src_size = 1;
    for (size_t i = 0; i < arg0_shape.size(); i++)
    {
        src_size *= arg0_shape[i];
    }
    // Assign Add to MKLDNN only for large f32 tensors: for small tensors the
    // MKLDNN primitive setup overhead outweighs any kernel speedup.
    if (node->get_input_element_type(0) == element::f32 &&
        node->get_input_element_type(1) == element::f32 && src_size > 64000)
    {
        auto op_annotations =
            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
        op_annotations->set_mkldnn_op(true);
        add->set_op_annotations(op_annotations);
    }
}
void CPUAssignment::ASSIGN_DECL(ngraph::op::ConvolutionBackpropData)
{
auto convolution = static_cast<op::ConvolutionBackpropData*>(node);
......@@ -227,9 +229,9 @@ namespace ngraph
#define TI(x) type_index(typeid(x))
static const runtime::cpu::pass::AssignOpMap s_dispatcher{
{TI(ngraph::op::Add), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Add>},
{TI(ngraph::op::Convolution),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>},
{TI(ngraph::op::Add), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Add>},
{TI(ngraph::op::ConvolutionBackpropData),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBackpropFilters),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment