Commit 39700785 authored by Amy Zhuang's avatar Amy Zhuang Committed by Scott Cyphers

Use MKLDNN for non-strided slice ops. (#1575)

* Use MKLDNN for non-strided slice ops.

* Remove unused variables.

* Add utility function is_strided to graph_util.
Add datatype check for slice in cpu_assignment.

* Create blocked mkldnn memory desc for output when input format is blocked.

* Add slice convertLayout fusion to cpu_post_layout_optimizations.

* Rename tensor_view.
Fix a bug.

* Rename a variable.
parent 202fc714
......@@ -553,3 +553,8 @@ bool ngraph::possibly_overwritten(Node* node)
}
return false;
}
// Returns true when the slice is strided, i.e. any stride differs from the
// unit stride of 1.
bool ngraph::is_strided(const Strides& strides)
{
    return !std::all_of(
        strides.begin(), strides.end(), [](size_t axis_stride) { return axis_stride == 1; });
}
......@@ -313,4 +313,6 @@ namespace ngraph
// Return true if a node's user could potentially overwrite
// the output of this node with in-place kernels
bool possibly_overwritten(Node* node);
bool is_strided(const Strides& strides);
}
......@@ -48,19 +48,30 @@ namespace ngraph
auto lower_bounds = slice->get_lower_bounds();
auto upper_bounds = slice->get_upper_bounds();
bool strided = false;
for (auto stride : strides)
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
if (stride != 1)
{
strided = true;
break;
}
}
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto slice_index = mkldnn_emitter->build_slice(
input_desc, result_desc, lower_bounds, out_shape);
auto& deps = mkldnn_emitter->get_primitive_deps(slice_index);
if (strided)
auto functor = [&, slice_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, slice_index);
};
functors.emplace_back(functor);
}
else
{
if (is_strided(strides))
{
std::function<decltype(runtime::cpu::kernel::strided_slice<float, 2>)> kernel;
std::function<decltype(runtime::cpu::kernel::strided_slice<float, 2>)>
kernel;
SELECT_KERNEL_BY_RANK(kernel,
args[0].get_element_type(),
......@@ -89,13 +100,14 @@ namespace ngraph
arg_shape.size(),
runtime::cpu::kernel::slice);
auto functor =
[&, kernel, arg_shape, out_shape, lower_bounds](CPURuntimeContext* ctx) {
auto functor = [&, kernel, arg_shape, out_shape, lower_bounds](
CPURuntimeContext* ctx) {
kernel(arg_tensor, out_tensor, arg_shape, out_shape, lower_bounds);
};
functors.emplace_back(functor);
}
}
}
REGISTER_OP_BUILDER(Slice);
}
......
......@@ -1942,6 +1942,32 @@ namespace ngraph
{
const ngraph::op::Slice* slice = static_cast<const ngraph::op::Slice*>(node);
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto out_shape = out[0].get_shape();
auto lower_bounds = slice->get_lower_bounds();
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto slice_index = mkldnn_emitter->build_slice(
input_desc, result_desc, lower_bounds, out_shape);
auto& deps = mkldnn_emitter->get_primitive_deps(slice_index);
writer.block_begin();
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << out[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(slice_index) << ");\n";
writer.block_end();
return;
}
writer.block_begin();
#if USE_EIGEN_CORE_INLINE == 1
size_t arg_rank = args[0].get_shape().size();
......
......@@ -1113,6 +1113,37 @@ size_t MKLDNNEmitter::build_concat(const std::vector<mkldnn::memory::desc>& inpu
return concat_index;
}
// Builds an MKLDNN primitive chain that implements a non-strided Slice:
// a view into the input memory at `lower_bounds` with extent `result_shape`,
// copied into the result memory by a reorder primitive.
//
// input_desc    - memory descriptor of the full input tensor
// result_desc   - memory descriptor of the sliced output tensor
// lower_bounds  - per-axis start offsets of the slice within the input
// result_shape  - per-axis sizes of the slice (equals the output shape)
//
// Returns the index of the reorder primitive; its dependencies are recorded
// as {input memory index, result memory index} in m_primitive_deps so callers
// can bind the actual tensor pointers at run time.
size_t MKLDNNEmitter::build_slice(const mkldnn::memory::desc& input_desc,
                                  const mkldnn::memory::desc& result_desc,
                                  const ngraph::Coordinate& lower_bounds,
                                  const ngraph::Shape& result_shape)
{
    std::vector<size_t> in_out_index;
    mkldnn::memory::primitive_desc input_pd =
        mkldnn::memory::primitive_desc(input_desc, runtime::cpu::mkldnn_utils::global_cpu_engine);
    size_t input_index = build_memory_primitive(input_desc);

    // Describe the sub-region of the input as an MKLDNN "view": same
    // underlying memory, restricted to `result_shape` starting at `offsets`.
    auto dims = mkldnn::memory::dims(result_shape.begin(), result_shape.end());
    auto offsets = mkldnn::memory::dims(lower_bounds.begin(), lower_bounds.end());
    auto view_pd = mkldnn::view::primitive_desc(input_pd, dims, offsets).dst_primitive_desc();

    mkldnn::memory::primitive_desc result_pd =
        mkldnn::memory::primitive_desc(result_desc, runtime::cpu::mkldnn_utils::global_cpu_engine);
    size_t result_index = build_memory_primitive(result_desc);

    // reorder primitive descriptor: copies from the view of the input into
    // the (possibly differently laid out) result memory.
    mkldnn::reorder::primitive_desc reorder_pd =
        mkldnn::reorder::primitive_desc(view_pd, result_pd);
    // reorder primitive
    size_t reorder_index = insert_primitive(new mkldnn::reorder(
        reorder_pd, *m_mkldnn_primitives[input_index], *m_mkldnn_primitives[result_index]));

    in_out_index.push_back(input_index);
    in_out_index.push_back(result_index);
    m_primitive_deps[reorder_index] = in_out_index;
    return reorder_index;
}
size_t MKLDNNEmitter::build_softmax_forward(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
int softmax_axis)
......
......@@ -515,6 +515,11 @@ namespace ngraph
const mkldnn::memory::desc& result_desc,
const size_t concat_dim);
size_t build_slice(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Coordinate& lower_bounds,
const ngraph::Shape& result_shape);
size_t build_softmax_forward(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
int softmax_axis);
......
......@@ -34,6 +34,7 @@
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/softmax.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
......@@ -669,6 +670,20 @@ namespace ngraph
}
}
// Decides whether a Slice op should run on the MKLDNN kernel path.
// The MKLDNN slice (view + reorder) only supports unit strides, and this
// assignment is limited to f32 inputs.
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Slice)
{
    auto slice = static_cast<op::Slice*>(node);
    // Bail out for strided slices or non-f32 element types; those fall back
    // to the reference/Eigen kernels.
    if (is_strided(slice->get_strides()) ||
        node->get_input_element_type(0) != element::f32)
    {
        return;
    }
    auto annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
    annotations->set_mkldnn_op(true);
    slice->set_op_annotations(annotations);
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedMaxPool)
{
......@@ -682,6 +697,7 @@ namespace ngraph
quantized_mp->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedAvgPool)
{
......@@ -814,6 +830,7 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
{TI(ngraph::op::QuantizedAvgPool),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedAvgPool>},
{TI(ngraph::op::Softmax), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Softmax>},
{TI(ngraph::op::Slice), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Slice>},
{TI(ngraph::op::Quantize), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Quantize>},
{TI(ngraph::op::ReplaceSlice),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReplaceSlice>},
......
......@@ -40,6 +40,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/op/sigmoid.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/softmax.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
......@@ -1583,6 +1584,45 @@ namespace ngraph
}
}
// Chooses the output memory layout for a Slice op. When the op runs on the
// MKLDNN path, the input's memory format is propagated to the output so the
// view+reorder slice primitive sees matching layouts; otherwise the op keeps
// native (row-major) layouts.
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::Slice)
{
    if (mkldnn_utils::use_mkldnn_kernel(node.get()))
    {
        // pass input format to output
        auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
        NGRAPH_DEBUG << "input memory format: " << input_md.data.format << "\n";
        auto result_format =
            static_cast<mkldnn::memory::format>(input_md.data.format);
        vector<memory::desc> o_mds;
        if (result_format == mkldnn::memory::blocked)
        {
            // "blocked" is not a named format, so a default md cannot be
            // created from it; build an explicit blocked md for the output
            // from the input tensor layout's strides instead.
            auto cpu_tvl = dynamic_pointer_cast<runtime::cpu::LayoutDescriptor>(
                node->get_inputs()[0]
                    .get_output()
                    .get_tensor_ptr()
                    ->get_tensor_layout());
            auto result_desc =
                mkldnn_utils::create_blocked_mkldnn_md(node->get_output_shape(0),
                                                       cpu_tvl->get_strides(),
                                                       node->get_element_type());
            o_mds.push_back(result_desc);
        }
        else
        {
            // Named format: reuse it directly for the output descriptor.
            auto result_desc = mkldnn_utils::create_default_mkldnn_md(
                node.get(), 0, true, result_format);
            o_mds.push_back(result_desc);
        }
        set_output_layouts(node, o_mds);
    }
    else
    {
        set_native_layouts(external_function, node);
    }
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::Concat)
{
......@@ -1783,6 +1823,7 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::ConvolutionAdd),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::ConvolutionAdd>},
{TI(ngraph::op::Dequantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Dequantize>},
{TI(ngraph::op::Slice), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Slice>},
{TI(ngraph::op::Quantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Quantize>},
};
......
......@@ -24,12 +24,15 @@
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/pattern/op/skip.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp"
......@@ -116,3 +119,54 @@ void ngraph::runtime::cpu::pass::CPUPostLayoutOptimizations::construct_weight_fu
auto m = make_shared<pattern::Matcher>(conv, callback);
this->add_matcher(m);
}
// Fuses a Slice followed by a ConvertLayout into a single MKLDNN slice whose
// output already carries the ConvertLayout's target layout. The MKLDNN slice
// is implemented as a reorder, so folding the layout conversion into it
// removes one reorder from the graph.
void ngraph::runtime::cpu::pass::CPUPostLayoutOptimizations::construct_slice_convertLayout_fusion()
{
    // Pattern: Slice(param) -> ConvertLayout. The concrete shapes/bounds here
    // are placeholders for the matcher; the callback reads the matched op's
    // actual bounds and strides.
    auto param = std::make_shared<pattern::op::Label>(element::f32, Shape{1, 576, 17, 17});
    auto slice = std::make_shared<ngraph::op::Slice>(
        param, Coordinate{0, 0, 0, 0}, Coordinate{1, 192, 17, 17});
    auto tvt = slice->get_outputs().at(0).get_tensor_ptr().get();
    auto lt_desc = std::make_shared<runtime::cpu::LayoutDescriptor>(*tvt);
    auto cvt_lt = std::make_shared<runtime::cpu::op::ConvertLayout>(slice, lt_desc);
    pattern::graph_rewrite_callback callback = [param](pattern::Matcher& m) {
        NGRAPH_DEBUG << "In a callback for construct_slice_converLayout against "
                     << m.get_match_root()->get_name();
        auto m_cvt_lt = m.get_match_root();
        auto m_slice = m_cvt_lt->get_argument(0);
        auto slice_ptr = static_cast<const ngraph::op::Slice*>(m_slice.get());
        // do the fusion if slice has 1 user and uses mkldnn kernel.
        if (!runtime::cpu::mkldnn_utils::use_mkldnn_kernel(slice_ptr) ||
            m_slice->get_users().size() != 1)
        {
            return false;
        }
        for (auto u : m.get_pattern_map()[param]->get_users())
        {
            // Only rewrite the matched slice; the param may have other users.
            if (u != m_slice)
            {
                continue;
            }
            // Rebuild the slice with identical bounds/strides...
            auto new_slice = std::make_shared<ngraph::op::Slice>(m_slice->get_argument(0),
                                                                 slice_ptr->get_lower_bounds(),
                                                                 slice_ptr->get_upper_bounds(),
                                                                 slice_ptr->get_strides());
            auto op_annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
            op_annotations->set_mkldnn_op(true);
            new_slice->set_op_annotations(op_annotations);
            // ...but give its output the ConvertLayout's output md, so the
            // slice's reorder produces the converted layout directly.
            auto tv = new_slice->get_output_tensor_ptr(0);
            auto layout = std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(*tv);
            layout->set_mkldnn_md(mkldnn_utils::get_output_mkldnn_md(m_cvt_lt.get(), 0));
            tv->set_tensor_layout(layout);
            // Replace the ConvertLayout (match root) with the fused slice.
            ngraph::replace_node(m_cvt_lt, new_slice);
        }
        return true;
    };
    auto m = make_shared<pattern::Matcher>(cvt_lt, callback);
    this->add_matcher(m);
}
......@@ -38,6 +38,8 @@ public:
: GraphRewrite()
{
construct_weight_fusion();
construct_slice_convertLayout_fusion();
}
void construct_weight_fusion();
void construct_slice_convertLayout_fusion();
};
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment