Commit 00b4453d authored by Jayaram Bobba, committed by Robert Kimball

IAT: More convolution folding optimizations (#1712)

* Check output shape when setting memory layout for slice op.

* Miscellaneous fusion and other optimizations for Inception-ResNetv2:
- ConvBias + BatchNorm folding (see the folding sketch after this message)
- ConvBias + affine folding
- Check whether MKLDNN can slice a given layout and select layouts
  appropriately

* Fixed unit test and bug in conv bias pattern

* Addressed PR feedback

* Addressed PR feedback
parent c6bb0cf4
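For context on the first folding above: at inference time, BatchNorm(Conv(x, W) + b) collapses into a single convolution-with-bias by rescaling the weights and bias per output channel. Below is a minimal sketch of that arithmetic, assuming flattened [C_out * K] weights; the helper name and layout are illustrative, not this commit's implementation.

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative sketch: fold BatchNorm(gamma, beta, mean, var, eps) into the
// weights and bias of a preceding convolution. For each output channel c:
//   W'[c] = W[c] * gamma[c] / sqrt(var[c] + eps)
//   b'[c] = (b[c] - mean[c]) * gamma[c] / sqrt(var[c] + eps) + beta[c]
void fold_batch_norm(std::vector<float>& weights, // [C_out * K], flattened
                     std::vector<float>& bias,    // [C_out]
                     const std::vector<float>& gamma,
                     const std::vector<float>& beta,
                     const std::vector<float>& mean,
                     const std::vector<float>& var,
                     double eps)
{
    const std::size_t c_out = bias.size();
    const std::size_t k = weights.size() / c_out; // weight elements per output channel
    for (std::size_t c = 0; c < c_out; ++c)
    {
        const float scale = gamma[c] / std::sqrt(static_cast<float>(var[c] + eps));
        for (std::size_t i = 0; i < k; ++i)
        {
            weights[c * k + i] *= scale;
        }
        bias[c] = (bias[c] - mean[c]) * scale + beta[c];
    }
}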
@@ -64,6 +64,8 @@ public:
    construct_zero_padded_conv();
    construct_zero_padded_conv_backprop_filters();
    construct_conv_bias_bprop();
+   construct_conv_bias_folded_batch_norm();
+   construct_conv_bias_affine_folding();
    construct_batch_norm_relu();
    construct_batch_norm_relu_global_stats();
    construct_conv_relu();
@@ -96,4 +98,6 @@ private:
    void construct_conv_add();
    void construct_conv_add_relu();
    void construct_bounded_relu();
+   void construct_conv_bias_folded_batch_norm();
+   void construct_conv_bias_affine_folding();
};
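The two new entries are invoked from the CPUFusion constructor, so the foldings run whenever the pass is applied. A minimal usage sketch, roughly how the unit tests further down drive the pass; apply_cpu_fusion is an illustrative helper name, not part of this commit:

#include <memory>

#include "ngraph/function.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"

// Run the CPU fusion pass (including the new conv-bias foldings) on a function.
void apply_cpu_fusion(std::shared_ptr<ngraph::Function> f)
{
    ngraph::pass::Manager pass_manager;
    pass_manager.register_pass<ngraph::runtime::cpu::pass::CPUFusion>();
    pass_manager.run_passes(f);
}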
@@ -1671,60 +1671,40 @@ namespace ngraph
{
    if (mkldnn_utils::use_mkldnn_kernel(node.get()))
    {
-       // pass input format to output
-       auto slice = static_cast<ngraph::op::Slice*>(node.get());
-       auto lower_bounds = slice->get_lower_bounds();
-       auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
-       NGRAPH_DEBUG << "input memory format: " << input_md.data.format << "\n";
-       auto result_format =
-           static_cast<mkldnn::memory::format>(input_md.data.format);
-       if (result_format == mkldnn::memory::nChw16c)
-       {
-           // check lower bound of channels
-           if (lower_bounds[1] % 16 != 0)
-           {
-               NGRAPH_DEBUG
-                   << "slice nChw16c: lower bound of channels not multiple of 16, "
-                      "set native layout\n";
-               set_native_layouts(external_function, node);
-               return;
-           }
-       }
-       else if (result_format == mkldnn::memory::nChw8c)
-       {
-           // check lower bound of channels
-           if (lower_bounds[1] % 8 != 0)
-           {
-               NGRAPH_DEBUG
-                   << "slice nChw8C: lower bound of channels not multiple of 8,"
-                      "set native layout\n";
-               set_native_layouts(external_function, node);
-               return;
-           }
-       }
-       vector<memory::desc> o_mds;
-       if (result_format == mkldnn::memory::blocked)
-       {
-           auto cpu_tvl = dynamic_pointer_cast<runtime::cpu::LayoutDescriptor>(
-               node->get_inputs()[0]
-                   .get_output()
-                   .get_tensor_ptr()
-                   ->get_tensor_layout());
-           auto result_desc =
-               mkldnn_utils::create_blocked_mkldnn_md(node->get_output_shape(0),
-                                                      cpu_tvl->get_strides(),
-                                                      node->get_element_type());
-           o_mds.push_back(result_desc);
-       }
-       else
-       {
-           auto result_desc = mkldnn_utils::create_default_mkldnn_md(
-               node.get(), 0, true, result_format);
-           o_mds.push_back(result_desc);
-       }
-       set_output_layouts(node, o_mds);
+       const ngraph::op::Slice* slice =
+           static_cast<const ngraph::op::Slice*>(node.get());
+       auto lower_bounds = slice->get_lower_bounds();
+       auto result_shape = slice->get_output_shape(0);
+       auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
+       auto input_pd = mkldnn::memory::primitive_desc(
+           input_md, runtime::cpu::mkldnn_utils::global_cpu_engine);
+       auto dims = mkldnn::memory::dims(result_shape.begin(), result_shape.end());
+       auto offsets =
+           mkldnn::memory::dims(lower_bounds.begin(), lower_bounds.end());
+       try
+       {
+           // MKLDNN currently doesn't support views for blocked layouts
+           // when the dims and offsets are not divisible by the block size
+           auto view_md = mkldnn::view::primitive_desc(input_pd, dims, offsets)
+                              .dst_primitive_desc()
+                              .desc();
+           vector<memory::desc> o_mds;
+           o_mds.push_back(view_md);
+           set_output_layouts(node, o_mds);
+       }
+       catch (const mkldnn::error& e)
+       {
+           if (e.status == mkldnn_unimplemented)
+           {
+               set_native_layouts(external_function, node);
+           }
+           else
+           {
+               throw ngraph_error(e.message);
+           }
+       }
    }
    else
    {
......
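The try/catch above replaces the earlier hand-written nChw16c/nChw8c checks: instead of testing divisibility itself, the layout pass now asks MKLDNN for a view descriptor and falls back to native layouts when MKLDNN reports mkldnn_unimplemented. The constraint being probed is roughly the one below; this helper is illustrative, not part of the patch:

#include <cstddef>

// Illustrative only: in a blocked layout such as nChw16c, channels are stored
// in groups of 16. A zero-copy view of a slice is only representable when the
// slice's channel offset falls on a block boundary, which is the condition the
// old code checked by hand (lower_bounds[1] % 16, or % 8 for nChw8c).
bool slice_offset_on_block_boundary(std::size_t channel_lower_bound,
                                    std::size_t block_size)
{
    return channel_lower_bound % block_size == 0;
}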
@@ -1612,7 +1612,7 @@ TEST(cpu_fusion, maxpool_with_indices_in_mxnet)
ASSERT_TRUE(std::dynamic_pointer_cast<op::Parameter>(mpwi_bprop->get_argument(2)));
}
-TEST(cpu_fusion, batch_norm_folding)
+TEST(cpu_fusion, conv_batch_norm_folding)
{
Shape shape_input{1, 8, 3, 3};
Shape shape_weights{2, 8, 1, 1};
@@ -1671,7 +1671,48 @@ TEST(cpu_fusion, batch_norm_folding)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
-TEST(cpu_fusion, affine_folding)
+TEST(cpu_fusion, convbias_batch_norm_folding)
+{
+    Shape shape_input{2, 8, 5, 5};
+    Shape shape_weights{2, 8, 2, 2};
+    Shape shape_norm{2};
+
+    auto make_function = [shape_input, shape_weights, shape_norm]() {
+        auto input = std::make_shared<op::Parameter>(element::f32, shape_input);
+        auto weights = std::make_shared<op::Parameter>(element::f32, shape_weights);
+        auto bias = std::make_shared<op::Parameter>(element::f32, Shape{2});
+        double eps = 1.01;
+        auto gamma = std::make_shared<op::Parameter>(element::f32, shape_norm);
+        auto beta = std::make_shared<op::Parameter>(element::f32, shape_norm);
+        auto mean = std::make_shared<op::Parameter>(element::f32, shape_norm);
+        auto var = std::make_shared<op::Parameter>(element::f32, shape_norm);
+
+        auto conv = std::make_shared<op::Convolution>(
+            input, weights, Strides{1, 1}, Strides{1, 1});
+        auto convbias =
+            conv + std::make_shared<op::Broadcast>(bias, conv->get_shape(), AxisSet{0, 2, 3});
+        auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, convbias, mean, var);
+        auto f = make_shared<Function>(
+            NodeVector{bn}, op::ParameterVector{input, weights, bias, gamma, beta, mean, var});
+        return f;
+    };
+
+    auto int_f = make_function();
+    auto cpu_f = make_function();
+
+    test::Uniform<float> rng(1.0f, 100.0f);
+    vector<vector<float>> args;
+    for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
+    {
+        vector<float> tensor_val(shape_size(param->get_shape()));
+        rng.initialize(tensor_val);
+        args.push_back(tensor_val);
+    }
+    auto int_results = execute(int_f, args, "INTERPRETER");
+    auto cpu_results = execute(cpu_f, args, "CPU");
+    EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
+}
+
+TEST(cpu_fusion, conv_affine_folding)
{
Shape shape_input{1, 8, 3, 3};
Shape shape_weights{2, 8, 1, 1};
@@ -1728,6 +1769,48 @@ TEST(cpu_fusion, affine_folding)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
+TEST(cpu_fusion, convbias_affine_folding)
+{
+    Shape shape_input{1, 6, 3, 3};
+    Shape shape_weights{3, 6, 1, 1};
+    Shape shape_norm{3};
+
+    auto make_function = [shape_input, shape_weights, shape_norm]() {
+        auto input = std::make_shared<op::Parameter>(element::f32, shape_input);
+        auto weights = std::make_shared<op::Parameter>(element::f32, shape_weights);
+        auto bias = std::make_shared<op::Parameter>(element::f32, Shape{3});
+        auto a = std::make_shared<op::Parameter>(element::f32, shape_norm);
+        auto b = std::make_shared<op::Parameter>(element::f32, shape_norm);
+
+        auto conv = std::make_shared<op::Convolution>(
+            input, weights, Strides{1, 1}, Strides{1, 1});
+        auto convbias =
+            conv + std::make_shared<op::Broadcast>(bias, conv->get_shape(), AxisSet{0, 2, 3});
+        auto out = std::make_shared<op::Add>(
+            std::make_shared<op::Multiply>(
+                convbias,
+                std::make_shared<op::Broadcast>(a, conv->get_shape(), AxisSet{0, 2, 3})),
+            std::make_shared<op::Broadcast>(b, conv->get_shape(), AxisSet{0, 2, 3}));
+        auto f =
+            make_shared<Function>(NodeVector{out}, op::ParameterVector{input, weights, bias, a, b});
+        return f;
+    };
+
+    auto int_f = make_function();
+    auto cpu_f = make_function();
+
+    test::Uniform<float> rng(20.0f, 300.0f);
+    vector<vector<float>> args;
+    for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
+    {
+        vector<float> tensor_val(shape_size(param->get_shape()));
+        rng.initialize(tensor_val);
+        args.push_back(tensor_val);
+    }
+    auto int_results = execute(int_f, args, "INTERPRETER");
+    auto cpu_results = execute(cpu_f, args, "CPU");
+    EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
+}
+
TEST(cpu_fusion, group_convolution_fusion)
{
Shape shape_a{1, 32, 2, 2};
......
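As with the batch-norm case, the pattern exercised by convbias_affine_folding, a * (conv + bias) + b with per-channel a and b, can be folded into the convolution itself. A hedged sketch of the rewrite; the function name and flattened weight layout are assumptions, not this commit's code:

#include <cstddef>
#include <vector>

// Illustrative sketch: fold y = a * (conv(x, W) + bias) + b into new conv
// parameters. For each output channel c:
//   W'[c]    = W[c] * a[c]
//   bias'[c] = bias[c] * a[c] + b[c]
void fold_affine(std::vector<float>& weights, // [C_out * K], flattened
                 std::vector<float>& bias,    // [C_out]
                 const std::vector<float>& a,
                 const std::vector<float>& b)
{
    const std::size_t c_out = bias.size();
    const std::size_t k = weights.size() / c_out; // weight elements per output channel
    for (std::size_t c = 0; c < c_out; ++c)
    {
        for (std::size_t i = 0; i < k; ++i)
        {
            weights[c * k + i] *= a[c];
        }
        bias[c] = bias[c] * a[c] + b[c];
    }
}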