Commit 519b18ac authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Robert Kimball

IAT: Skip reshapes that are removing or adding size-1 dimensions (#1684)

* Reshape optimizations for when unit-sized dimensions are added/removed from tensors

* Added unit tests for eliminating squeeze and expand_dims operations

* Bug fix to expand dims layout

* Style fix
parent 00b4453d
......@@ -44,7 +44,7 @@ namespace ngraph
s *= shape[shape.size() - (i + 1)];
}
std::reverse(m_strides.begin(), m_strides.end());
m_mkldnn_memory_size = shape_size(tv.get_shape()) * tv.get_element_type().size();
m_buffer_size = shape_size(tv.get_shape()) * tv.get_element_type().size();
}
size_t LayoutDescriptor::get_index_offset(const std::vector<size_t>& indices)
......@@ -109,7 +109,7 @@ namespace ngraph
{
auto mem_prim_desc =
mkldnn::memory::primitive_desc(md, mkldnn_utils::global_cpu_engine);
m_mkldnn_memory_size = mem_prim_desc.get_size();
m_buffer_size = mem_prim_desc.get_size();
}
catch (const mkldnn::error& e)
{
......@@ -118,6 +118,15 @@ namespace ngraph
e.message);
}
}
// A descriptor that is not an MKLDNN layout is always treated as row-major.
// Otherwise, build the canonical row-major (blocked) memory descriptor for
// this tensor's shape/strides and report whether it matches the current md.
bool LayoutDescriptor::is_row_major_layout()
{
    if (is_mkldnn_layout())
    {
        const auto row_major_md = runtime::cpu::mkldnn_utils::create_blocked_mkldnn_md(
            get_shape(), m_strides, get_element_type());
        return runtime::cpu::mkldnn_utils::compare_mkldnn_mds(m_mkldnn_md, row_major_md);
    }
    return true;
}
}
}
}
......@@ -37,7 +37,7 @@ namespace ngraph
public:
LayoutDescriptor(const ngraph::descriptor::Tensor& tv);
~LayoutDescriptor() override {}
virtual size_t get_allocated_size() override { return m_mkldnn_memory_size; }
virtual size_t get_allocated_size() override { return m_buffer_size; }
size_t get_offset() const { return m_offset; }
size_t get_index_offset(const std::vector<size_t>& indices) override;
......@@ -51,6 +51,7 @@ namespace ngraph
{
return m_mkldnn_md.data.format != mkldnn::memory::format::format_undef;
}
bool is_row_major_layout();
static const mkldnn::memory::desc DummyDesc;
......@@ -64,7 +65,7 @@ namespace ngraph
// Otherwise, physical layout is assumed to be in row-major
// format represented by m_strides
mkldnn::memory::desc m_mkldnn_md;
size_t m_mkldnn_memory_size;
size_t m_buffer_size;
};
typedef std::vector<std::shared_ptr<ngraph::runtime::cpu::LayoutDescriptor>>
......
......@@ -371,31 +371,10 @@ mkldnn::memory::desc runtime::cpu::mkldnn_utils::create_blocked_mkldnn_md(
return memory::desc(md);
}
memory::desc runtime::cpu::mkldnn_utils::rotate_blocked_md(const memory::desc& in,
AxisVector& axis_order)
// MKLDNN kernel selection sometimes relies on named layouts like "mkldnn_nchw"
// Try and convert a blocked layout into a named layout
memory::desc runtime::cpu::mkldnn_utils::try_get_named_md(mkldnn_memory_desc_t md)
{
mkldnn_memory_desc_t md;
md.primitive_kind = in.data.primitive_kind;
md.ndims = in.data.ndims;
md.format = mkldnn_blocked;
md.data_type = in.data.data_type;
for (size_t i = 0; i < in.data.ndims; i++)
{
md.layout_desc.blocking.block_dims[i] =
in.data.layout_desc.blocking.block_dims[axis_order[i]];
md.layout_desc.blocking.strides[1][i] =
in.data.layout_desc.blocking.strides[1][axis_order[i]];
md.layout_desc.blocking.strides[0][i] =
in.data.layout_desc.blocking.strides[0][axis_order[i]];
md.layout_desc.blocking.padding_dims[i] =
in.data.layout_desc.blocking.padding_dims[axis_order[i]];
md.layout_desc.blocking.offset_padding_to_data[i] =
in.data.layout_desc.blocking.offset_padding_to_data[axis_order[i]];
md.dims[i] = in.data.dims[axis_order[i]];
}
md.layout_desc.blocking.offset_padding = in.data.layout_desc.blocking.offset_padding;
auto out_md = memory::desc(md);
auto get_named_md = [](const mkldnn_memory_desc_t& blk, const mkldnn_memory_format_t format) {
......@@ -448,12 +427,132 @@ memory::desc runtime::cpu::mkldnn_utils::rotate_blocked_md(const memory::desc& i
return out_md;
}
bool runtime::cpu::mkldnn_utils::use_mkldnn_kernel(const ngraph::Node* node)
memory::desc runtime::cpu::mkldnn_utils::rotate_blocked_md(const memory::desc& in,
const AxisVector& axis_order)
{
auto op_annotations = static_cast<const ngraph::op::Op*>(node)->get_op_annotations();
return (op_annotations &&
static_pointer_cast<ngraph::runtime::cpu::CPUOpAnnotations>(op_annotations)
->is_mkldnn_op());
mkldnn_memory_desc_t md;
md.primitive_kind = in.data.primitive_kind;
md.ndims = in.data.ndims;
md.format = mkldnn_blocked;
md.data_type = in.data.data_type;
for (size_t i = 0; i < in.data.ndims; i++)
{
md.layout_desc.blocking.block_dims[i] =
in.data.layout_desc.blocking.block_dims[axis_order[i]];
md.layout_desc.blocking.strides[1][i] =
in.data.layout_desc.blocking.strides[1][axis_order[i]];
md.layout_desc.blocking.strides[0][i] =
in.data.layout_desc.blocking.strides[0][axis_order[i]];
md.layout_desc.blocking.padding_dims[i] =
in.data.layout_desc.blocking.padding_dims[axis_order[i]];
md.layout_desc.blocking.offset_padding_to_data[i] =
in.data.layout_desc.blocking.offset_padding_to_data[axis_order[i]];
md.dims[i] = in.data.dims[axis_order[i]];
}
md.layout_desc.blocking.offset_padding = in.data.layout_desc.blocking.offset_padding;
return try_get_named_md(md);
}
// Build a memory descriptor equivalent to "in" with the listed unit-sized
// axes removed (squeeze).  axis_list appears to be assumed sorted in
// increasing order (the single-pass cursor below relies on it) — TODO confirm
// with callers.  Throws if too many axes are removed or if any listed axis is
// not unit-sized.
memory::desc runtime::cpu::mkldnn_utils::squeeze_blocked_md(const memory::desc& in,
                                                            AxisVector& axis_list)
{
    if (in.data.ndims <= axis_list.size())
    {
        throw ngraph_error("Squeezing too many axes: input " + to_string(in.data.ndims) +
                           " , removing " + to_string(axis_list.size()));
    }
    for (auto axis : axis_list)
    {
        if (in.data.dims[axis] != 1)
        {
            throw ngraph_error("Cannot squeeze axis on non unit-size, axis: " + to_string(axis) +
                               " size: " + to_string(in.data.dims[axis]));
        }
    }

    mkldnn_memory_desc_t out;
    out.primitive_kind = in.data.primitive_kind;
    out.ndims = in.data.ndims - static_cast<int>(axis_list.size());
    out.format = mkldnn_blocked;
    out.data_type = in.data.data_type;

    size_t next_squeeze = 0; // cursor into axis_list: next axis to drop
    size_t dst = 0;          // next free slot in the output descriptor
    for (size_t src = 0; src < in.data.ndims; src++)
    {
        if (next_squeeze < axis_list.size() && src == axis_list[next_squeeze])
        {
            // This source axis is being squeezed away.
            next_squeeze++;
            continue;
        }
        // Copy the blocking description of a surviving axis through unchanged.
        out.dims[dst] = in.data.dims[src];
        out.layout_desc.blocking.block_dims[dst] = in.data.layout_desc.blocking.block_dims[src];
        out.layout_desc.blocking.strides[1][dst] = in.data.layout_desc.blocking.strides[1][src];
        out.layout_desc.blocking.strides[0][dst] = in.data.layout_desc.blocking.strides[0][src];
        out.layout_desc.blocking.padding_dims[dst] =
            in.data.layout_desc.blocking.padding_dims[src];
        out.layout_desc.blocking.offset_padding_to_data[dst] =
            in.data.layout_desc.blocking.offset_padding_to_data[src];
        dst++;
    }
    out.layout_desc.blocking.offset_padding = in.data.layout_desc.blocking.offset_padding;
    return try_get_named_md(out);
}
// Build a memory descriptor equivalent to "in" with unit-sized axes inserted
// at the positions listed in axis_list (expand_dims).  Positions refer to the
// *output* descriptor and appear to be assumed sorted in increasing order —
// TODO confirm with callers.
memory::desc runtime::cpu::mkldnn_utils::expand_blocked_md(const memory::desc& in,
                                                           AxisVector& axis_list)
{
    mkldnn_memory_desc_t md;
    md.primitive_kind = in.data.primitive_kind;
    md.ndims = in.data.ndims + static_cast<int>(axis_list.size());
    md.format = mkldnn_blocked;
    md.data_type = in.data.data_type;

    // i walks the input axes, j walks the output axes, k walks axis_list.
    size_t k = 0;
    for (size_t i = 0, j = 0; j < md.ndims; j++)
    {
        // Guard k before indexing: once every new axis has been inserted,
        // axis_list[k] would read past the end of the vector (the squeeze
        // counterpart already guards this way).
        if (k < axis_list.size() && j == axis_list[k])
        {
            // Newly inserted unit axis: size 1, no blocking, no padding.
            k++;
            md.dims[j] = 1;
            md.layout_desc.blocking.block_dims[j] = 1;
            md.layout_desc.blocking.padding_dims[j] = 1;
            md.layout_desc.blocking.offset_padding_to_data[j] = 0;
            if (i > 0)
            {
                // Inherit the strides of the previous source axis so the unit
                // axis contributes nothing to the address computation.
                md.layout_desc.blocking.strides[1][j] =
                    in.data.layout_desc.blocking.strides[1][i - 1];
                md.layout_desc.blocking.strides[0][j] =
                    in.data.layout_desc.blocking.strides[0][i - 1];
            }
            else
            {
                // Inserting before the outermost axis: outer stride is the
                // total element count of the input.
                md.layout_desc.blocking.strides[1][j] = 0;
                size_t nelems = 1;
                for (size_t idx = 0; idx < in.data.ndims; idx++)
                    nelems *= in.data.dims[idx];
                md.layout_desc.blocking.strides[0][j] = nelems;
            }
        }
        else
        {
            // Copy an existing axis through unchanged.
            md.dims[j] = in.data.dims[i];
            md.layout_desc.blocking.strides[1][j] = in.data.layout_desc.blocking.strides[1][i];
            md.layout_desc.blocking.strides[0][j] = in.data.layout_desc.blocking.strides[0][i];
            md.layout_desc.blocking.block_dims[j] = in.data.layout_desc.blocking.block_dims[i];
            md.layout_desc.blocking.padding_dims[j] =
                in.data.layout_desc.blocking.padding_dims[i];
            md.layout_desc.blocking.offset_padding_to_data[j] =
                in.data.layout_desc.blocking.offset_padding_to_data[i];
            i++;
        }
    }
    md.layout_desc.blocking.offset_padding = in.data.layout_desc.blocking.offset_padding;
    return try_get_named_md(md);
}
bool runtime::cpu::mkldnn_utils::compare_mkldnn_formats(mkldnn::memory::format lhs,
......@@ -493,3 +592,38 @@ bool runtime::cpu::mkldnn_utils::is_mkldnn_blocked_data_format(mkldnn::memory::f
}
return false;
}
// True if the descriptor carries mkldnn padding that is visible on any of the
// axes in axis_list (padded dims or a per-axis data offset), or a nonzero
// global padding offset.  Axes not listed in axis_list are ignored.
bool runtime::cpu::mkldnn_utils::is_mkldnn_padded_layout(const mkldnn::memory::desc& in,
                                                         const AxisVector& axis_list)
{
    const auto& blk = in.data.layout_desc.blocking;
    if (blk.offset_padding != 0)
    {
        return true;
    }
    for (size_t dim = 0; dim < in.data.ndims; dim++)
    {
        const bool examined =
            std::find(axis_list.begin(), axis_list.end(), dim) != axis_list.end();
        if (examined &&
            (blk.padding_dims[dim] != in.data.dims[dim] ||
             blk.offset_padding_to_data[dim] != 0))
        {
            return true;
        }
    }
    return false;
}
// True iff the node carries CPU op annotations that mark it as an MKLDNN op.
bool runtime::cpu::mkldnn_utils::use_mkldnn_kernel(const ngraph::Node* node)
{
    auto annotations = static_cast<const ngraph::op::Op*>(node)->get_op_annotations();
    if (!annotations)
    {
        return false;
    }
    return static_pointer_cast<ngraph::runtime::cpu::CPUOpAnnotations>(annotations)
        ->is_mkldnn_op();
}
......@@ -43,6 +43,7 @@ namespace ngraph
const mkldnn::memory::desc& get_input_mkldnn_md(const Node* node, size_t index);
const mkldnn::memory::desc& get_output_mkldnn_md(const Node* node, size_t index);
mkldnn::memory::desc create_default_mkldnn_md(const Node* node,
size_t index,
bool is_output,
......@@ -54,14 +55,24 @@ namespace ngraph
mkldnn::memory::desc create_blocked_mkldnn_md(const Shape& dims,
const Strides& strides,
const ngraph::element::Type type);
mkldnn::memory::desc try_get_named_md(mkldnn_memory_desc_t md);
mkldnn::memory::desc rotate_blocked_md(const mkldnn::memory::desc& in,
AxisVector& axis_order);
bool use_mkldnn_kernel(const ngraph::Node* node);
const AxisVector& axis_order);
mkldnn::memory::desc squeeze_blocked_md(const mkldnn::memory::desc& in,
AxisVector& axis_list);
mkldnn::memory::desc expand_blocked_md(const mkldnn::memory::desc& in,
AxisVector& axis_list);
bool compare_mkldnn_formats(mkldnn::memory::format lhs, mkldnn::memory::format rhs);
bool compare_mkldnn_mds(const mkldnn::memory::desc& lhs,
const mkldnn::memory::desc& rhs);
bool is_mkldnn_padded_layout(const mkldnn::memory::desc& in,
const AxisVector& axis_list);
bool is_mkldnn_filter_format(mkldnn::memory::format fmt);
bool is_mkldnn_blocked_data_format(mkldnn::memory::format fmt);
bool use_mkldnn_kernel(const ngraph::Node* node);
std::unordered_set<std::type_index>& get_op_registry();
std::map<element::Type, const mkldnn::memory::data_type>&
get_mkldnn_data_type_map();
......
......@@ -483,7 +483,7 @@ namespace ngraph
auto arg0_rank = arg0_shape.size();
auto result_shape = node->get_output_shape(0);
if ((arg0_rank == 4 || arg0_rank == 2) &&
if ((arg0_rank == 4 || arg0_rank == 3 || arg0_rank == 2) &&
node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
......
This diff is collapsed.
......@@ -31,7 +31,10 @@
#include "ngraph/op/parameter.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/pass/cpu_assignment.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "nlohmann/json.hpp"
......@@ -191,3 +194,166 @@ TEST(cpu_test, mkldnn_layouts)
EXPECT_EQ(vector<float>{expected_result}, rv);
}
// Reshape 1x32x2x2 conv output to 32x2x2, i.e. only a unit-sized axis is
// removed.  The CPU layout pass should treat this as a squeeze and not emit a
// layout conversion for the reshape itself.
TEST(cpu_test, reshape_squeeze)
{
    auto build_graph = []() -> std::shared_ptr<Function> {
        auto data = make_shared<op::Parameter>(element::f32, Shape{1, 16, 2, 2});
        auto weights = make_shared<op::Parameter>(element::f32, Shape{32, 16, 1, 1});
        auto conv = make_shared<op::Convolution>(data,
                                                 weights,
                                                 Strides{1, 1},
                                                 Strides{1, 1},
                                                 CoordinateDiff{0, 0},
                                                 CoordinateDiff{0, 0},
                                                 Strides{1, 1});
        auto squeeze = make_shared<op::Reshape>(conv, AxisVector{0, 1, 2, 3}, Shape{32, 2, 2});
        return make_shared<Function>(NodeVector{squeeze}, op::ParameterVector{data, weights});
    };

    auto backend = runtime::Backend::create("CPU");
    auto cpu_f = build_graph();
    auto int_f = build_graph();

    // Feed both backends the same random inputs.
    test::Uniform<float> rng(-100.0f, 100.0f);
    vector<vector<float>> args;
    for (const shared_ptr<op::Parameter>& param : cpu_f->get_parameters())
    {
        vector<float> tensor_val(shape_size(param->get_shape()));
        rng.initialize(tensor_val);
        args.push_back(tensor_val);
    }

    auto int_results = execute(int_f, args, "INTERPRETER");
    auto cpu_results = execute(cpu_f, args, "CPU");
    // Two convert layouts for inputs and weights of convolution.
    EXPECT_EQ(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 2);
    for (size_t i = 0; i < cpu_results.size(); i++)
    {
        EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
    }
}
// Reshape the 1x32x2x2 convolution output to 1x32x2x1x2x1, i.e. only
// unit-sized axes are inserted.  The CPU layout pass should treat this as an
// expand_dims and not emit a layout conversion for the reshape itself.
TEST(cpu_test, reshape_expand)
{
    auto make_function = []() -> std::shared_ptr<Function> {
        auto A = make_shared<op::Parameter>(element::f32, Shape{1, 16, 2, 2});
        auto B = make_shared<op::Parameter>(element::f32, Shape{32, 16, 1, 1});
        auto conv = make_shared<op::Convolution>(A,
                                                 B,
                                                 Strides{1, 1},
                                                 Strides{1, 1},
                                                 CoordinateDiff{0, 0},
                                                 CoordinateDiff{0, 0},
                                                 Strides{1, 1});
        auto expand =
            make_shared<op::Reshape>(conv, AxisVector{0, 1, 2, 3}, Shape{1, 32, 2, 1, 2, 1});
        return make_shared<Function>(NodeVector{expand}, op::ParameterVector{A, B});
    };

    // Note: no explicit Backend::create here — execute() selects the backend
    // by name, so the previously-created local backend was unused.
    auto cpu_f = make_function();
    auto int_f = make_function();

    // Feed both backends the same random inputs.
    test::Uniform<float> rng(-100.0f, 100.0f);
    vector<vector<float>> args;
    for (const shared_ptr<op::Parameter>& param : cpu_f->get_parameters())
    {
        vector<float> tensor_val(shape_size(param->get_shape()));
        rng.initialize(tensor_val);
        args.push_back(tensor_val);
    }

    auto int_results = execute(int_f, args, "INTERPRETER");
    auto cpu_results = execute(cpu_f, args, "CPU");
    // Two convert layouts for inputs and weights of convolution; the reshape
    // itself must not add a third (consistent with reshape_squeeze above).
    EXPECT_EQ(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 2);
    for (size_t i = 0; i < cpu_results.size(); i++)
    {
        EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
    }
}
// Squeeze of a padded mkldnn layout: the convolution produces a single output
// channel (weights are 1x16x1x1) and its 1x1x2x2 result is reshaped to 2x2.
// An extra layout conversion is expected after the convolution — presumably
// because the single-channel output gets a padded mkldnn layout that cannot
// be squeezed in place (confirm against the CPU layout pass).
TEST(cpu_test, reshape_squeeze_padded)
{
    auto make_function = []() -> std::shared_ptr<Function> {
        auto A = make_shared<op::Parameter>(element::f32, Shape{1, 16, 2, 2});
        auto B = make_shared<op::Parameter>(element::f32, Shape{1, 16, 1, 1});
        auto conv = make_shared<op::Convolution>(A,
                                                 B,
                                                 Strides{1, 1},
                                                 Strides{1, 1},
                                                 CoordinateDiff{0, 0},
                                                 CoordinateDiff{0, 0},
                                                 Strides{1, 1});
        auto squeeze = make_shared<op::Reshape>(conv, AxisVector{0, 1, 2, 3}, Shape{2, 2});
        return make_shared<Function>(NodeVector{squeeze}, op::ParameterVector{A, B});
    };
    auto backend = runtime::Backend::create("CPU");
    auto cpu_f = make_function();
    auto int_f = make_function();

    // Feed both backends the same random inputs.
    test::Uniform<float> rng(-100.0f, 100.0f);
    vector<vector<float>> args;
    for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
    {
        vector<float> tensor_val(shape_size(param->get_shape()));
        rng.initialize(tensor_val);
        args.push_back(tensor_val);
    }
    auto int_results = execute(int_f, args, "INTERPRETER");
    auto cpu_results = execute(cpu_f, args, "CPU");
    // Two convert layouts for inputs and weights of convolution.
    // One convert layout after convolution
    EXPECT_EQ(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 3);
    for (size_t i = 0; i < cpu_results.size(); i++)
    {
        EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
    }
}
// Round trip through a squeeze/expand pair: conv1 (output 1x32x1x8) ->
// squeeze to 1x32x8 -> relu -> expand back to 1x32x1x8 -> conv2.  Checks
// that the two reshapes around the elementwise op do not force extra layout
// conversions between the convolutions.
TEST(cpu_test, reshape_expand_squeeze)
{
    auto make_function = []() -> std::shared_ptr<Function> {
        auto A = make_shared<op::Parameter>(element::f32, Shape{1, 16, 1, 8});
        auto B1 = make_shared<op::Parameter>(element::f32, Shape{32, 16, 1, 1});
        auto conv1 = make_shared<op::Convolution>(A,
                                                  B1,
                                                  Strides{1, 1},
                                                  Strides{1, 1},
                                                  CoordinateDiff{0, 0},
                                                  CoordinateDiff{0, 0},
                                                  Strides{1, 1});
        // Remove the unit-sized axis 2, apply relu, then restore it.
        auto squeeze = make_shared<op::Reshape>(conv1, AxisVector{0, 1, 2, 3}, Shape{1, 32, 8});
        auto relu = make_shared<op::Relu>(squeeze);
        auto expand = make_shared<op::Reshape>(relu, AxisVector{0, 1, 2}, Shape{1, 32, 1, 8});
        auto B2 = make_shared<op::Parameter>(element::f32, Shape{8, 32, 1, 1});
        auto conv2 = make_shared<op::Convolution>(expand,
                                                  B2,
                                                  Strides{1, 1},
                                                  Strides{1, 1},
                                                  CoordinateDiff{0, 0},
                                                  CoordinateDiff{0, 0},
                                                  Strides{1, 1});
        return make_shared<Function>(NodeVector{conv2}, op::ParameterVector{A, B1, B2});
    };
    auto backend = runtime::Backend::create("CPU");
    auto cpu_f = make_function();
    auto int_f = make_function();

    // Feed both backends the same random inputs.
    test::Uniform<float> rng(-100.0f, 100.0f);
    vector<vector<float>> args;
    for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
    {
        vector<float> tensor_val(shape_size(param->get_shape()));
        rng.initialize(tensor_val);
        args.push_back(tensor_val);
    }
    auto int_results = execute(int_f, args, "INTERPRETER");
    auto cpu_results = execute(cpu_f, args, "CPU");
    for (size_t i = 0; i < cpu_results.size(); i++)
    {
        EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
    }
    // Upper bound, not an exact count: at most four conversions (inputs and
    // weights of the two convolutions); the squeeze/expand reshapes must not
    // add any beyond that.
    EXPECT_LE(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 4);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment