Commit 82ee0a77 authored by Jayaram Bobba and committed by Scott Cyphers

Convolution sum fusion (#1146)

* inplace compute

* fix warnings

* Initial support for convolution sum fusion

* Added in-place support for conv sum fusion and test cases

* reverting spurious changes

* Bug fix to account for inplace input in conv sum fusion

* fix compilation error

* Addressed PR feedback
parent f7069237
@@ -3103,6 +3103,95 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBiasAdd)
{
auto convolution = static_cast<const ngraph::op::ConvolutionBiasAdd*>(node);
auto arg0_shape = args[0].get_shape();
auto arg1_shape = args[1].get_shape();
auto arg2_shape = args[2].get_shape();
auto result_shape = out[0].get_shape();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
// For dilation, MKLDNN wants to know how many elements to insert between, not how far
// apart to space the elements as nGraph does, so we subtract 1 from each stride.
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
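// Example: an nGraph window dilation stride of 2 (filter taps two elements apart)
// maps to an MKLDNN dilation of 1 (one zero inserted between neighboring taps).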
auto input_format =
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
auto weights_format =
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1);
auto bias_format = mkldnn_utils::get_input_mkldnn_format(node, 2);
// HACK to help MKLDNN pick the right implementation
if (weights_format == mkldnn::memory::format::nchw)
{
weights_format = mkldnn::memory::format::oihw;
}
auto output_format =
runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0);
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_data_desc =
mkldnn_emitter->build_memory_descriptor(args[0], input_format);
auto weights_desc =
mkldnn_emitter->build_memory_descriptor(args[1], weights_format);
auto bias_desc = mkldnn_emitter->build_memory_descriptor(args[2], bias_format);
// Since this is an in-place kernel, args[3] and out[0] will share the same
// memory buffer and descriptor
auto result_desc =
mkldnn_emitter->build_memory_descriptor(out[0], output_format);
size_t conv_index = 0;
mkldnn::post_ops ops;
ops.append_sum(1.f);
const float ops_scale = 1.f;
const float ops_alpha = -0.f; // relu negative slope
const float ops_beta = 0.f;
if (convolution->with_relu())
{
ops.append_eltwise(
ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
}
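// MKLDNN applies post-ops in the order they were appended: the destination is
// accumulated first (dst += conv result), then relu, when requested, runs on that sum.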
conv_index = mkldnn_emitter->build_convolution_forward(
input_data_desc,
weights_desc,
bias_desc,
result_desc,
convolution->get_window_movement_strides(),
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
ops);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << args[1].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << args[2].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[3])
<< ", " << out[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(conv_index) << ");\n";
}
else
{
throw ngraph_error("ConvolutionBiasAdd is only supported with MKLDNN kernel.");
}
}
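For reference, a minimal sketch of what the fused primitive computes, written as plain C++ (stride 1, no padding or dilation; conv_bias_add_ref is a hypothetical helper for illustration, not part of this change):
#include <algorithm>
#include <cstddef>
#include <vector>
std::vector<float> conv_bias_add_ref(const std::vector<float>& data,   // N x C x H x W
                                     const std::vector<float>& filt,   // O x C x R x S
                                     const std::vector<float>& bias,   // O
                                     const std::vector<float>& sum_in, // N x O x HO x WO
                                     size_t N, size_t C, size_t H, size_t W,
                                     size_t O, size_t R, size_t S,
                                     bool with_relu)
{
    const size_t HO = H - R + 1, WO = W - S + 1; // stride 1, no padding
    std::vector<float> out(N * O * HO * WO);
    for (size_t n = 0; n < N; ++n)
        for (size_t o = 0; o < O; ++o)
            for (size_t y = 0; y < HO; ++y)
                for (size_t x = 0; x < WO; ++x)
                {
                    float acc = bias[o];
                    for (size_t c = 0; c < C; ++c)
                        for (size_t r = 0; r < R; ++r)
                            for (size_t s = 0; s < S; ++s)
                                acc += data[((n * C + c) * H + (y + r)) * W + (x + s)] *
                                       filt[((o * C + c) * R + r) * S + s];
                    const size_t i = ((n * O + o) * HO + y) * WO + x;
                    acc += sum_in[i];                              // append_sum(1.f)
                    out[i] = with_relu ? std::max(acc, 0.f) : acc; // optional eltwise_relu
                }
    return out;
}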
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBiasBackpropFiltersBias)
{
@@ -268,6 +268,7 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::ConvolutionRelu), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionRelu>},
{TI(ngraph::op::ConvolutionBiasRelu),
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBiasRelu>},
{TI(ngraph::op::ConvolutionBiasAdd), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBiasAdd>},
// conv+bias backprop for data share the same implementation as ConvolutionBackpropData
{TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBiasBackpropFiltersBias>},
@@ -223,3 +223,87 @@ shared_ptr<Node>
m_padding_above_forward,
m_data_dilation_strides_forward);
}
op::ConvolutionBiasAdd::ConvolutionBiasAdd(const std::shared_ptr<op::ConvolutionBias>& conv,
const std::shared_ptr<Node>& sum_input,
bool with_relu)
: RequiresTensorViewArgs(
"ConvolutionBiasAdd",
{conv->get_argument(0), conv->get_argument(1), conv->get_argument(2), sum_input})
, m_window_movement_strides(conv->get_window_movement_strides())
, m_window_dilation_strides(conv->get_window_dilation_strides())
, m_padding_below(conv->get_padding_below())
, m_padding_above(conv->get_padding_above())
, m_data_dilation_strides(conv->get_data_dilation_strides())
, m_with_relu(with_relu)
{
set_value_type_checked(conv->get_element_type(), conv->get_shape());
}
op::ConvolutionBiasAdd::ConvolutionBiasAdd(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const std::shared_ptr<Node>& bias,
const std::shared_ptr<Node>& sum_input,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
bool with_relu)
: RequiresTensorViewArgs("ConvolutionBiasAdd", {data_batch, filters, bias, sum_input})
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
, m_padding_above(padding_above)
, m_data_dilation_strides(data_dilation_strides)
, m_with_relu(with_relu)
{
auto& data_batch_shape = data_batch->get_shape();
auto& data_batch_et = data_batch->get_element_type();
auto& filters_shape = filters->get_shape();
auto& filters_et = filters->get_element_type();
//
// Make sure data batch and filter element types match.
//
if (data_batch_et != filters_et)
{
throw ngraph_error("Convolution data batch and filter element types do not match");
}
set_value_type_checked(
data_batch_et,
util::infer_convolution_output_shape(data_batch_shape,
filters_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
0, /* batch_axis_data, */
1, /* input_channel_axis_data, */
1, /* input_channel_axis_filters, */
0, /* output_channel_axis_filters, */
0, /* batch_axis_result, */
1, /* output_channel_axis_result, */
""));
}
std::shared_ptr<Node> op::ConvolutionBiasAdd::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 4)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::shared_ptr<Node>(new ConvolutionBiasAdd(new_args.at(0),
new_args.at(1),
new_args.at(2),
new_args.at(3),
get_window_movement_strides(),
get_window_dilation_strides(),
get_padding_below(),
get_padding_above(),
get_data_dilation_strides(),
m_with_relu));
}
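As a quick sanity check of the shape inference above, using the standard convolution output formula (hand-derived, matching the shapes used in the tests below):
// out_dim = (in_dim + pad_below + pad_above - dilated_filter_dim) / stride + 1
// data {2, 1, 2, 2}, filters {1, 1, 1, 1}, stride 1, no padding:
//   H_out = (2 + 0 + 0 - 1) / 1 + 1 = 2, W_out = 2  ->  output shape {2, 1, 2, 2}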
@@ -149,5 +149,43 @@ namespace ngraph
CoordinateDiff m_padding_above_backward;
Strides m_data_dilation_strides_backward;
};
class ConvolutionBiasAdd : public util::RequiresTensorViewArgs
{
public:
ConvolutionBiasAdd(const std::shared_ptr<op::ConvolutionBias>& conv,
const std::shared_ptr<Node>& sum_input,
bool with_relu);
ConvolutionBiasAdd(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const std::shared_ptr<Node>& bias,
const std::shared_ptr<Node>& sum_input,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
bool with_relu);
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
bool with_relu() const { return m_with_relu; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
protected:
Strides m_window_movement_strides;
Strides m_window_dilation_strides;
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
bool m_with_relu;
};
}
}
@@ -213,6 +213,34 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ConvolutionBiasAdd)
{
auto convolution = static_cast<op::ConvolutionBiasAdd*>(node);
auto arg0_rank = node->get_input_shape(0).size();
auto arg1_rank = node->get_input_shape(1).size();
bool data_dilated = false;
for (size_t s : convolution->get_data_dilation_strides())
{
data_dilated = data_dilated || (s != 1);
}
if (!data_dilated && arg0_rank == 4 && arg1_rank == 4 &&
node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
const int ADD_INPUT = 3;
// Accumulates conv into the second input of the unfused add
std::map<size_t, size_t> oi_pairs = {{0, ADD_INPUT}};
op_annotations->set_in_place_oi_pairs(oi_pairs);
convolution->set_op_annotations(op_annotations);
}
}
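A hedged note on the annotation above: the oi-pair map declares which output may alias which input.
// {0, 3} tells the memory planner that output 0 of ConvolutionBiasAdd may reuse
// the buffer of input 3 (the Add operand), so MKLDNN's append_sum post-op can
// accumulate the convolution result into that buffer in place.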
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::BatchNormRelu)
{
@@ -618,6 +646,8 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionRelu>},
{TI(ngraph::op::ConvolutionBiasRelu),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBiasRelu>},
{TI(ngraph::op::ConvolutionBiasAdd),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBiasAdd>},
{TI(ngraph::op::BatchNormRelu),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::BatchNormRelu>},
{TI(ngraph::op::ConvolutionBackpropData),
@@ -22,7 +22,6 @@
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/broadcast.hpp"
@@ -1072,6 +1071,172 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_relu()
this->add_matcher(m);
}
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_add()
{
Shape shape{2, 2, 1, 1};
auto data_batch = std::make_shared<pattern::op::Label>(element::f32, shape);
auto filters = std::make_shared<pattern::op::Label>(element::f32, shape);
auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{1});
auto pconv = std::make_shared<op::ConvolutionBias>(data_batch,
filters,
bias,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto add_input = std::make_shared<pattern::op::Label>(element::f32, pconv->get_shape());
auto padd = std::make_shared<op::Add>(add_input, pconv);
pattern::graph_rewrite_callback callback = [data_batch, filters](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_conv_sum against "
<< m.get_match_root()->get_name();
auto add_m = m.get_match_root();
auto pattern_map = m.get_pattern_map();
auto conv_m = std::dynamic_pointer_cast<op::ConvolutionBias>(add_m->get_argument(1));
auto inplace_input = add_m->get_argument(0);
if (!conv_m)
{
conv_m = std::dynamic_pointer_cast<op::ConvolutionBias>(add_m->get_argument(0));
inplace_input = add_m->get_argument(1);
}
// These checks make sure an MKLDNN convolution kernel can be used.
bool data_dilated = false;
for (size_t s : conv_m->get_data_dilation_strides())
{
data_dilated = data_dilated || (s != 1);
}
if (data_dilated)
{
NGRAPH_DEBUG << "Convolution has dilations greater than 1";
return false;
}
if (conv_m->get_element_type() != element::f32)
{
NGRAPH_DEBUG << "Convolution isn't of type float";
return false;
}
auto arg0_rank = conv_m->get_input_shape(0).size();
auto arg1_rank = conv_m->get_input_shape(1).size();
if (arg0_rank != 4 || arg1_rank != 4)
{
NGRAPH_DEBUG << "Convolution's arguments ranks aren't equal to 4";
return false;
}
if (conv_m->get_users().size() > 1)
{
NGRAPH_DEBUG << "Convolution has more than one user";
// return false;
}
if (inplace_input->get_users().size() > 1)
{
NGRAPH_DEBUG << "Add has more than one user. Convolution Add might use an in-place "
"destructive kernel";
// return false;
}
if (inplace_input->is_parameter())
{
NGRAPH_DEBUG
<< "Unsafe to use in-place kernel since add's in-place input is a parameter";
return false;
}
for (auto add_user : m.get_match_root()->get_users())
{
if (add_user->is_output())
{
// TODO: Remove restriction once we handle this case in codegen
NGRAPH_DEBUG
<< "Unsafe to use in-place kernel since add's in-place output is a result";
return false;
}
}
auto conv_add =
std::shared_ptr<Node>(new op::ConvolutionBiasAdd(conv_m, inplace_input, false));
ngraph::replace_node(m.get_match_root(), conv_add);
return true;
};
auto m = std::make_shared<pattern::Matcher>(padd, callback, "conv_bias_add");
this->add_matcher(m);
}
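Schematically, the callback replaces Add(other, ConvolutionBias(...)) with the fused node; a hedged construction sketch using the constructors shown earlier (data, filters, bias, other, and add are placeholder nodes, not part of the pass):
auto conv = std::make_shared<op::ConvolutionBias>(data, filters, bias,
                                                  Strides{1, 1}, Strides{1, 1},
                                                  CoordinateDiff{0, 0}, CoordinateDiff{0, 0},
                                                  Strides{1, 1});
auto fused = std::make_shared<op::ConvolutionBiasAdd>(conv, other, /*with_relu=*/false);
ngraph::replace_node(add, fused);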
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_add_relu()
{
Shape shape{2, 2, 1, 1};
auto data_batch = std::make_shared<pattern::op::Label>(element::f32, shape);
auto filters = std::make_shared<pattern::op::Label>(element::f32, shape);
auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{1});
auto add_input = std::make_shared<pattern::op::Label>(element::f32, shape);
auto pconv = std::make_shared<op::ConvolutionBiasAdd>(data_batch,
filters,
bias,
add_input,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1},
false);
auto prelu = std::make_shared<op::Relu>(pconv);
pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_conv_sum against "
<< m.get_match_root()->get_name();
auto conv_m =
std::dynamic_pointer_cast<op::ConvolutionBiasAdd>(m.get_match_root()->get_argument(0));
if (conv_m->get_users().size() > 1)
{
NGRAPH_DEBUG << "Convolution has more than one user";
return false;
}
for (auto conv_bias_user : m.get_match_root()->get_users())
{
if (conv_bias_user->is_output())
{
// TODO: Remove restriction once we handle this case in codegen
NGRAPH_DEBUG << "Unsafe to use in-place kernel since in-place output is a result";
return false;
}
}
// ConvolutionBiasAdd created only if it can run with MKLDNN.
// No further checks needed.
auto conv_n =
std::make_shared<op::ConvolutionBiasAdd>(conv_m->get_argument(0),
conv_m->get_argument(1),
conv_m->get_argument(2),
conv_m->get_argument(3),
conv_m->get_window_movement_strides(),
conv_m->get_window_dilation_strides(),
conv_m->get_padding_below(),
conv_m->get_padding_above(),
conv_m->get_data_dilation_strides(),
true);
ngraph::replace_node(m.get_match_root(), conv_n);
return true;
};
auto m = std::make_shared<pattern::Matcher>(prelu, callback, "conv_bias_add_relu");
this->add_matcher(m);
}
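The relu variant simply rebuilds the fused node with the flag flipped; informally, the equivalence being exploited is:
// Relu(ConvolutionBiasAdd(args..., with_relu = false))
//   == ConvolutionBiasAdd(args..., with_relu = true)
// which MKLDNN realizes as an eltwise_relu post-op after the sum accumulation.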
void ngraph::runtime::cpu::pass::CPUFusion::construct_sigmoid_multiply()
{
// Construct predicate to match sigmoid and tanh
@@ -64,6 +64,8 @@ public:
construct_batch_norm_relu_global_stats();
construct_conv_relu();
construct_conv_bias_relu();
construct_conv_bias_add();
construct_conv_bias_add_relu();
}
if (fusions & DIFFERENTIABLE_FUSIONS)
@@ -89,4 +91,6 @@ private:
void construct_batch_norm_relu_global_stats();
void construct_conv_relu();
void construct_conv_bias_relu();
void construct_conv_bias_add();
void construct_conv_bias_add_relu();
};
@@ -457,6 +457,27 @@ namespace ngraph
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::ConvolutionBiasAdd)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node.get()))
{
vector<memory::format> prim_input_formats;
vector<memory::format> prim_output_formats;
ConvolutionLayout<ngraph::op::ConvolutionBiasAdd, true, false>(
node, prim_input_formats, prim_output_formats);
// Force the extra sum (Add) input to use the same layout as the convolution output
prim_input_formats.push_back(prim_output_formats[0]);
node =
insert_input_conversions(external_function, node, prim_input_formats);
set_output_layouts(node, prim_output_formats);
}
else
{
set_default_layouts(external_function, node);
}
}
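One hedged illustration of the forced layout:
// If ConvolutionLayout selects, say, nChw8c for the convolution output, the Add
// operand (input 3) is converted to nChw8c as well, so the in-place append_sum
// post-op reads and writes a buffer whose layout matches the destination exactly.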
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::ConvolutionBackpropData)
{
@@ -1474,6 +1495,8 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
&runtime::cpu::pass::CPULayout::layout<ngraph::op::ConvolutionRelu>},
{TI(ngraph::op::ConvolutionBiasRelu),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::ConvolutionBiasRelu>},
{TI(ngraph::op::ConvolutionBiasAdd),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::ConvolutionBiasAdd>},
{TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::ConvolutionBiasBackpropFiltersBias>},
{TI(ngraph::op::BatchNorm), &runtime::cpu::pass::CPULayout::layout<ngraph::op::BatchNorm>},
@@ -979,6 +979,59 @@ TEST(cpu_fusion, conv_bias_relu_n2c1h2w2_2)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
// ConvolutionBiasAdd relies on an in-place fused MKLDNN kernel.
// Need to ensure that it is fused only when in-place buffer allocation is feasible
shared_ptr<Function> gen_conv_bias_add(bool param_input, bool result_output)
{
auto A = make_shared<op::Parameter>(element::f32, Shape{2, 1, 2, 2});
auto weights = make_shared<op::Parameter>(element::f32, Shape{1, 1, 1, 1});
auto bias = make_shared<op::Parameter>(element::f32, Shape{1});
auto conv = make_shared<op::Convolution>(A, weights, Strides{1, 1}, Strides{1, 1});
auto bias_broadcast = make_shared<op::Broadcast>(bias, conv->get_shape(), AxisSet{0, 2, 3});
auto convbias = conv + bias_broadcast;
auto B = make_shared<op::Parameter>(element::f32, Shape{2, 1, 2, 2});
auto abs_B = make_shared<op::Abs>(B);
auto add =
param_input ? make_shared<op::Add>(convbias, B) : make_shared<op::Add>(convbias, abs_B);
auto abs = make_shared<op::Abs>(add);
return result_output ? make_shared<Function>(add, op::ParameterVector{A, weights, bias, B})
: make_shared<Function>(abs, op::ParameterVector{A, weights, bias, B});
}
TEST(cpu_fusion, fuse_conv_bias_add)
{
auto func_fuse = gen_conv_bias_add(false, false);
auto func_nofuse1 = gen_conv_bias_add(true, false);
auto func_nofuse2 = gen_conv_bias_add(false, true);
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.run_passes(func_fuse);
ASSERT_EQ(count_ops_of_type<op::ConvolutionBiasAdd>(func_fuse), 1);
pass_manager.run_passes(func_nofuse1);
ASSERT_EQ(count_ops_of_type<op::ConvolutionBiasAdd>(func_nofuse1), 0);
pass_manager.run_passes(func_nofuse2);
ASSERT_EQ(count_ops_of_type<op::ConvolutionBiasAdd>(func_nofuse2), 0);
}
TEST(cpu_fusion, conv_bias_add)
{
auto int_f = gen_conv_bias_add(false, false);
auto cpu_f = gen_conv_bias_add(false, false);
vector<vector<float>> args{{1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f},
{-1.25f},
{2.25f},
{1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f}};
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
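A hand check of the first output element of this test, derived from the values above:
// conv: 1.25 * (-1.25) + 2.25 = 0.6875   (1x1 filter, bias broadcast over the output)
// add : 0.6875 + |1.25| = 1.9375         (B passes through Abs in gen_conv_bias_add)
// abs : |1.9375| = 1.9375                -> first element of the expected result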
std::vector<shared_ptr<runtime::TensorView>>
rnn_matrix_fusion_eval(const size_t time_steps,
const Shape& data_shape,