Commit 00b4453d authored by Jayaram Bobba, committed by Robert Kimball

IAT: More convolution folding optimizations (#1712)

* Check output shape when setting memory layout for slice op.

* Miscellaneous fusion and other optimizations for inception-resnetv2
- ConvBias Batchnorm folding
- ConvBias Affine folding
- Check if MKLDNN can slice a given layout and select layouts
  appropriately

* Fixed unit test and bug in conv bias pattern

* Addressed PR feedback

* Addressed PR feedback
parent c6bb0cf4
@@ -741,7 +741,6 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_batch_norm_relu()
auto beta_shape = Shape{2};
auto beta = std::make_shared<pattern::op::Label>(element::f32, beta_shape);
double eps = 0.001;
auto shape_r = Shape{1, 2, 2, 2};
auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, input);
auto goe = std::make_shared<op::GetOutputElement>(bn, 0);
auto prelu = std::make_shared<op::Relu>(goe);
@@ -811,7 +810,6 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_batch_norm_relu_global_stats()
auto beta_shape = Shape{2};
auto beta = std::make_shared<pattern::op::Label>(element::f32, beta_shape);
double eps = 0.001;
auto shape_r = Shape{1, 2, 2, 2};
auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, input, mean, var);
auto prelu = std::make_shared<op::Relu>(bn);
@@ -1403,3 +1401,210 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_bounded_relu()
auto m = std::make_shared<pattern::Matcher>(min, callback);
this->add_matcher(m);
}
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_folded_batch_norm()
{
auto input = std::make_shared<pattern::op::Label>(element::f32, Shape{2, 2, 1, 1});
auto filters = std::make_shared<pattern::op::Label>(element::f32, Shape{2, 2, 1, 1});
auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto pconv = std::make_shared<op::ConvolutionBias>(input,
filters,
bias,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto mean = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto var = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto gamma = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto beta = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
double eps = 0.001;
auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, pconv, mean, var);
ngraph::pattern::graph_rewrite_callback callback =
[input, filters, bias, mean, var, gamma, beta](pattern::Matcher& m) {
NGRAPH_DEBUG << "In callback for folded batch norm against node = "
<< m.get_match_root()->get_name();
auto pattern_map = m.get_pattern_map();
auto m_bn = std::dynamic_pointer_cast<op::BatchNorm>(m.get_match_root());
auto m_conv = std::dynamic_pointer_cast<op::ConvolutionBias>(m_bn->get_argument(2));
if (m_conv->get_users().size() > 1)
{
return false;
}
if (m_conv->get_shape().size() != 4)
{
return false;
}
// new weights = old weights * gamma / sqrt(variance + epsilon)
// new biases = (old_bias-mean) * gamma / sqrt(variance + epsilon) + beta
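// Folding is valid because convolution is linear in its filters: scaling an
// output channel's filters by gamma / sqrt(variance + epsilon) scales that
// channel of the convolution output by the same factor, so the BatchNorm
// affine transform can be absorbed into the ConvolutionBias weights and bias.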
auto bn_eps = op::Constant::create(element::f32, Shape{}, {m_bn->get_eps_value()});
auto var_eps = std::make_shared<op::Add>(
pattern_map[var],
std::make_shared<op::Broadcast>(bn_eps, pattern_map[var]->get_shape(), AxisSet{0}));
auto sqrt_var_eps = std::make_shared<op::Sqrt>(var_eps);
auto mean_gamma = std::make_shared<op::Multiply>(
std::make_shared<op::Subtract>(pattern_map[bias], pattern_map[mean]),
pattern_map[gamma]);
auto new_biases = std::make_shared<op::Add>(
pattern_map[beta], std::make_shared<op::Divide>(mean_gamma, sqrt_var_eps));
auto weight_scaling = std::make_shared<op::Divide>(pattern_map[gamma], sqrt_var_eps);
auto new_weights = std::make_shared<op::Multiply>(
pattern_map[filters],
std::make_shared<op::Broadcast>(
weight_scaling, pattern_map[filters]->get_shape(), AxisSet{1, 2, 3}));
auto conv_bias =
std::make_shared<op::ConvolutionBias>(pattern_map[input],
new_weights,
new_biases,
m_conv->get_window_movement_strides(),
m_conv->get_window_dilation_strides(),
m_conv->get_padding_below(),
m_conv->get_padding_above(),
m_conv->get_data_dilation_strides());
ngraph::replace_node(m.get_match_root(), conv_bias);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(bn, callback);
this->add_matcher(m);
}
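The algebra behind the folding above can be sanity-checked in isolation. The following is a minimal, standalone C++ sketch (illustration only, not part of this commit; all shapes and values are made up) that verifies the per-channel identity for a 1x1 convolution with two input and two output channels:

#include <cassert>
#include <cmath>

int main()
{
    const float eps = 0.001f;
    float x[2] = {1.5f, -2.0f};                    // one pixel, 2 input channels
    float w[2][2] = {{0.3f, 0.7f}, {-0.4f, 0.9f}}; // w[oc][ic], 1x1 kernel
    float b[2] = {0.5f, -0.1f};                    // convolution bias
    float gamma[2] = {1.2f, 0.8f}, beta[2] = {-0.3f, 0.2f};
    float mean[2] = {0.6f, -0.4f}, var[2] = {2.0f, 1.5f};

    for (int oc = 0; oc < 2; ++oc)
    {
        // Reference: ConvolutionBias followed by BatchNorm (inference form).
        float conv = w[oc][0] * x[0] + w[oc][1] * x[1] + b[oc];
        float bn = gamma[oc] * (conv - mean[oc]) / std::sqrt(var[oc] + eps) + beta[oc];

        // Folded: the same ConvolutionBias with rescaled weights and bias.
        float scale = gamma[oc] / std::sqrt(var[oc] + eps);
        float folded = (w[oc][0] * scale) * x[0] + (w[oc][1] * scale) * x[1] +
                       ((b[oc] - mean[oc]) * scale + beta[oc]);
        assert(std::fabs(bn - folded) < 1e-4f);
    }
    return 0;
}

The same per-output-channel scaling carries over to larger kernels and spatial extents, which is why the pass broadcasts weight_scaling along AxisSet{1, 2, 3} of the filters.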
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_affine_folding()
{
// Folds A * ConvBias(input, filters, bias) -> ConvBias(input, filters * A_c, bias * A_c)
Shape shape{2, 2, 1, 1};
auto input = std::make_shared<pattern::op::Label>(element::f32, shape);
auto filters = std::make_shared<pattern::op::Label>(element::f32, shape);
auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto conv = std::make_shared<op::ConvolutionBias>(input,
filters,
bias,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto conv_label = std::make_shared<pattern::op::Label>(conv, nullptr, NodeVector{conv});
auto Ac = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto A = std::make_shared<op::Broadcast>(Ac, Shape{2, 2, 1, 1}, AxisSet{0, 2, 3});
auto A_label = std::make_shared<pattern::op::Label>(A, nullptr, NodeVector{A});
auto multiply = std::make_shared<op::Multiply>(conv_label, A_label);
ngraph::pattern::graph_rewrite_callback callback = [input, filters, bias, conv_label, A_label](
pattern::Matcher& m) {
NGRAPH_DEBUG << "In callback for conv affine folding against node = "
<< m.get_match_root()->get_name();
auto pattern_map = m.get_pattern_map();
auto conv_m = std::static_pointer_cast<op::ConvolutionBias>(pattern_map[conv_label]);
if (conv_m->get_users().size() > 1)
{
return false;
}
if (conv_m->get_shape().size() != 4)
{
return false;
}
if (conv_m->with_relu())
{
return false;
}
auto A_m = std::static_pointer_cast<op::Broadcast>(pattern_map[A_label]);
// Check if values are being broadcast along channel (2nd) dimension
auto is_channel_bcast = [](const std::shared_ptr<op::Broadcast>& bcast) {
if (bcast->get_argument(0)->get_shape().size() == 0)
{
return true;
}
if (bcast->get_argument(0)->get_shape().size() == 1 &&
bcast->get_broadcast_axes() == AxisSet{0, 2, 3})
{
return true;
}
if (bcast->get_argument(0)->get_shape().size() == 2)
{
auto input_shape = bcast->get_argument(0)->get_shape();
if (input_shape[0] == 1 && bcast->get_broadcast_axes() == AxisSet{2, 3})
return true;
}
return false;
};
if (!is_channel_bcast(A_m))
{
return false;
}
auto get_bcast_input = [](const std::shared_ptr<op::Broadcast>& bcast) {
if (bcast->get_argument(0)->get_shape().size() == 0)
{
Shape bshape{bcast->get_shape()[1]};
return std::static_pointer_cast<ngraph::Node>(
std::make_shared<op::Broadcast>(bcast->get_argument(0), bshape, AxisSet{0}));
}
if (bcast->get_argument(0)->get_shape().size() == 1)
{
return bcast->get_argument(0);
}
if (bcast->get_argument(0)->get_shape().size() == 2)
{
Shape bshape{bcast->get_argument(0)->get_shape()[1]};
return std::static_pointer_cast<ngraph::Node>(std::make_shared<op::Reshape>(
bcast->get_argument(0), AxisVector{0, 1}, bshape));
}
throw ngraph_error("Unexpected shape for bcast input");
};
auto Ac_m = get_bcast_input(A_m);
// new weights = old weights * Ac_m
// new_bias = old_bias * Ac_m;
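// As with the batch norm folding above, this relies on convolution being linear
// in its filters: a per-output-channel multiplier A_c applied to the
// ConvolutionBias result can instead be applied to the filters (broadcast along
// the non-output-channel axes) and to the bias.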
auto filters_n = std::make_shared<op::Multiply>(
pattern_map[filters],
std::make_shared<op::Broadcast>(
Ac_m, pattern_map[filters]->get_shape(), AxisSet{1, 2, 3}));
auto bias_n = std::make_shared<op::Multiply>(pattern_map[bias], Ac_m);
auto convbias_n =
std::make_shared<op::ConvolutionBias>(pattern_map[input],
filters_n,
bias_n,
conv_m->get_window_movement_strides(),
conv_m->get_window_dilation_strides(),
conv_m->get_padding_below(),
conv_m->get_padding_above(),
conv_m->get_data_dilation_strides());
ngraph::replace_node(m.get_match_root(), convbias_n);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(multiply, callback);
this->add_matcher(m);
}
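The identity used here is simpler than the batch norm case: A_c * (conv(x, W_c) + b_c) == conv(x, A_c * W_c) + A_c * b_c for each output channel c. A short standalone sketch (illustration only; the scalar values are made up) of that equivalence:

#include <cassert>
#include <cmath>

int main()
{
    float conv_out = 3.0f; // conv(x, W_c) for some input, before the bias
    float b = 0.5f;        // bias for channel c
    float a = 1.7f;        // per-channel multiplier A_c

    float affine = a * (conv_out + b);       // Multiply node applied to the ConvBias output
    float folded = (conv_out * a) + (b * a); // ConvBias with filters and bias pre-scaled
    assert(std::fabs(affine - folded) < 1e-5f);
    return 0;
}

Note that the matcher roots at the Multiply, so a trailing + B (as in the convbias_affine_folding test) remains in the graph as an ordinary Add.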
@@ -64,6 +64,8 @@ public:
construct_zero_padded_conv();
construct_zero_padded_conv_backprop_filters();
construct_conv_bias_bprop();
construct_conv_bias_folded_batch_norm();
construct_conv_bias_affine_folding();
construct_batch_norm_relu();
construct_batch_norm_relu_global_stats();
construct_conv_relu();
@@ -96,4 +98,6 @@ private:
void construct_conv_add();
void construct_conv_add_relu();
void construct_bounded_relu();
void construct_conv_bias_folded_batch_norm();
void construct_conv_bias_affine_folding();
};
@@ -1671,60 +1671,40 @@ namespace ngraph
{
if (mkldnn_utils::use_mkldnn_kernel(node.get()))
{
-// pass input format to output
+const ngraph::op::Slice* slice =
+    static_cast<const ngraph::op::Slice*>(node.get());
+auto lower_bounds = slice->get_lower_bounds();
+auto result_shape = slice->get_output_shape(0);
auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
-NGRAPH_DEBUG << "input memory format: " << input_md.data.format << "\n";
-auto result_format =
-    static_cast<mkldnn::memory::format>(input_md.data.format);
+auto input_pd = mkldnn::memory::primitive_desc(
+    input_md, runtime::cpu::mkldnn_utils::global_cpu_engine);
+auto dims = mkldnn::memory::dims(result_shape.begin(), result_shape.end());
+auto offsets =
+    mkldnn::memory::dims(lower_bounds.begin(), lower_bounds.end());
-auto slice = static_cast<ngraph::op::Slice*>(node.get());
-auto lower_bounds = slice->get_lower_bounds();
-if (result_format == mkldnn::memory::nChw16c)
+try
{
-    // check lower bound of channels
-    if (lower_bounds[1] % 16 != 0)
-    {
-        NGRAPH_DEBUG
-            << "slice nChw16c: lower bound of channels not multiple of 16, "
-               "set native layout\n";
-        set_native_layouts(external_function, node);
-        return;
-    }
+    // MKLDNN currently doesn't support views for blocked layouts
+    // when the dims and offsets are not divisible by the block size
+    auto view_md = mkldnn::view::primitive_desc(input_pd, dims, offsets)
+                       .dst_primitive_desc()
+                       .desc();
+    vector<memory::desc> o_mds;
+    o_mds.push_back(view_md);
+    set_output_layouts(node, o_mds);
}
-else if (result_format == mkldnn::memory::nChw8c)
+catch (const mkldnn::error& e)
{
-    // check lower bound of channels
-    if (lower_bounds[1] % 8 != 0)
+    if (e.status == mkldnn_unimplemented)
    {
-        NGRAPH_DEBUG
-            << "slice nChw8C: lower bound of channels not multiple of 8,"
-               "set native layout\n";
        set_native_layouts(external_function, node);
-        return;
    }
+    else
+    {
+        throw ngraph_error(e.message);
+    }
}
-vector<memory::desc> o_mds;
-if (result_format == mkldnn::memory::blocked)
-{
-    auto cpu_tvl = dynamic_pointer_cast<runtime::cpu::LayoutDescriptor>(
-        node->get_inputs()[0]
-            .get_output()
-            .get_tensor_ptr()
-            ->get_tensor_layout());
-    auto result_desc =
-        mkldnn_utils::create_blocked_mkldnn_md(node->get_output_shape(0),
-                                               cpu_tvl->get_strides(),
-                                               node->get_element_type());
-    o_mds.push_back(result_desc);
-}
-else
-{
-    auto result_desc = mkldnn_utils::create_default_mkldnn_md(
-        node.get(), 0, true, result_format);
-    o_mds.push_back(result_desc);
-}
-set_output_layouts(node, o_mds);
}
else
{
......
@@ -1612,7 +1612,7 @@ TEST(cpu_fusion, maxpool_with_indices_in_mxnet)
ASSERT_TRUE(std::dynamic_pointer_cast<op::Parameter>(mpwi_bprop->get_argument(2)));
}
TEST(cpu_fusion, batch_norm_folding)
TEST(cpu_fusion, conv_batch_norm_folding)
{
Shape shape_input{1, 8, 3, 3};
Shape shape_weights{2, 8, 1, 1};
@@ -1671,7 +1671,48 @@ TEST(cpu_fusion, batch_norm_folding)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
TEST(cpu_fusion, affine_folding)
TEST(cpu_fusion, convbias_batch_norm_folding)
{
Shape shape_input{2, 8, 5, 5};
Shape shape_weights{2, 8, 2, 2};
Shape shape_norm{2};
auto make_function = [shape_input, shape_weights, shape_norm]() {
auto input = std::make_shared<op::Parameter>(element::f32, shape_input);
auto weights = std::make_shared<op::Parameter>(element::f32, shape_weights);
auto bias = std::make_shared<op::Parameter>(element::f32, Shape{2});
double eps = 1.01;
auto gamma = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto beta = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto mean = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto var = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto conv = std::make_shared<op::Convolution>(input, weights, Strides{1, 1}, Strides{1, 1});
auto convbias =
conv + std::make_shared<op::Broadcast>(bias, conv->get_shape(), AxisSet{0, 2, 3});
auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, convbias, mean, var);
auto f = make_shared<Function>(
NodeVector{bn}, op::ParameterVector{input, weights, bias, gamma, beta, mean, var});
return f;
};
auto int_f = make_function();
auto cpu_f = make_function();
test::Uniform<float> rng(1.0f, 100.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
TEST(cpu_fusion, conv_affine_folding)
{
Shape shape_input{1, 8, 3, 3};
Shape shape_weights{2, 8, 1, 1};
@@ -1728,6 +1769,48 @@ TEST(cpu_fusion, affine_folding)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
TEST(cpu_fusion, convbias_affine_folding)
{
Shape shape_input{1, 6, 3, 3};
Shape shape_weights{3, 6, 1, 1};
Shape shape_norm{3};
auto make_function = [shape_input, shape_weights, shape_norm]() {
auto input = std::make_shared<op::Parameter>(element::f32, shape_input);
auto weights = std::make_shared<op::Parameter>(element::f32, shape_weights);
auto bias = std::make_shared<op::Parameter>(element::f32, Shape{3});
auto a = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto b = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto conv = std::make_shared<op::Convolution>(input, weights, Strides{1, 1}, Strides{1, 1});
auto convbias =
conv + std::make_shared<op::Broadcast>(bias, conv->get_shape(), AxisSet{0, 2, 3});
auto out = std::make_shared<op::Add>(
std::make_shared<op::Multiply>(
convbias, std::make_shared<op::Broadcast>(a, conv->get_shape(), AxisSet{0, 2, 3})),
std::make_shared<op::Broadcast>(b, conv->get_shape(), AxisSet{0, 2, 3}));
auto f =
make_shared<Function>(NodeVector{out}, op::ParameterVector{input, weights, bias, a, b});
return f;
};
auto int_f = make_function();
auto cpu_f = make_function();
test::Uniform<float> rng(20.0f, 300.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
TEST(cpu_fusion, group_convolution_fusion)
{
Shape shape_a{1, 32, 2, 2};
......