Commit eda11da7 authored by Amy Zhuang, committed by Adam Procter

Refactor to create MKLDNN primitives on the first iteration: (#2363)

* Refactor to create MKLDNN primitives on the first iteration:
  add, avg_pool, batch_norm, bounded_relu, concat, convert_layout,
  leaky_relu, lrn, max_pool, quantized_avg_pool, quantized_max_pool,
  relu, sigmoid, slice, softmax.

* Refactor to create MKLDNN primitives on the first iteration:
  pooling backward, convolution.

* Refactor to create MKLDNN primitives on the first iteration:
  convolution backward, rnn, lstm, quantization, dequantization.

* Delete one duplicate declaration.

* Create and pass mkldnn descriptors/primitive-descriptors for ops.

* Create and pass mkldnn descriptors for convolution backward ops.

* Remove one unused variable.

* Remove unused variables.

* Remove unused variables.

* Address PR feedback.

* Fix a bug.

* Add one parameter to build_quantize_reorder.

* Address PR feedback.

* Fix bi-rnn issue.
parent c571b7a7
......@@ -38,28 +38,22 @@ namespace ngraph
{
auto& functors = external_function->get_functors();
vector<float> scale_vector(2, 1);
vector<mkldnn::memory::primitive_desc> inputs_pd;
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input0_data_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto input1_data_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
inputs_pd.push_back(mkldnn::memory::primitive_desc(
input0_data_desc, runtime::cpu::executor::global_cpu_engine));
inputs_pd.push_back(mkldnn::memory::primitive_desc(
input1_data_desc, runtime::cpu::executor::global_cpu_engine));
size_t add_index = mkldnn_emitter->build_elementwise_add(
input0_data_desc, input1_data_desc, result_desc, scale_vector, inputs_pd);
auto sum_pd = mkldnn_emitter->get_elementwise_add_desc(node);
// Add needs 4 primitives: input0, input1, result, and sum.
size_t add_index = mkldnn_emitter->reserve_primitive_space(4);
auto& deps = mkldnn_emitter->get_primitive_deps(add_index);
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto functor = [&, add_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, sum_pd, add_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_elementwise_add(sum_pd, add_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
......
......@@ -52,24 +52,19 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t avg_pool_index = mkldnn_emitter->build_pooling_forward(
(include_padding_in_avg_computation
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
input_desc,
result_desc,
window_movement_strides,
window_shape,
padding_below,
padding_above);
auto avg_pool_desc =
mkldnn_emitter->get_avg_pooling_forward_desc<ngraph::op::AvgPool>(node,
false);
// AvgPool needs 3 primitives: input, result, and pooling_forward.
size_t avg_pool_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(avg_pool_index);
auto functor = [&, avg_pool_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, avg_pool_desc, avg_pool_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_pooling_forward(avg_pool_desc, avg_pool_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, avg_pool_index);
......@@ -130,23 +125,23 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto diff_dst_desc = runtime::cpu::mkldnn_utils::get_input_mkldnn_md(node, 0);
auto diff_src_desc = runtime::cpu::mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t avg_pool_index = mkldnn_emitter->build_pooling_backward(
(apb->get_include_padding_in_avg_computation()
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
diff_dst_desc,
diff_src_desc,
apb->get_window_movement_strides(),
apb->get_window_shape(),
apb->get_padding_below(),
apb->get_padding_above());
auto avg_pool_fwd_desc =
mkldnn_emitter->get_avg_pooling_forward_desc<ngraph::op::AvgPoolBackprop>(
node, true);
auto avg_pool_desc =
mkldnn_emitter->get_avg_pooling_backward_desc<ngraph::op::AvgPoolBackprop>(
node);
// AvgPoolBackprop needs 3 primitives: input, result, and pooling_backward.
size_t avg_pool_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(avg_pool_index);
auto functor = [&, avg_pool_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, avg_pool_desc, avg_pool_fwd_desc, avg_pool_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_pooling_backward(
avg_pool_desc, avg_pool_fwd_desc, avg_pool_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], delta_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, avg_pool_index);
......
......@@ -48,8 +48,6 @@ namespace ngraph
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
const OP* batchnorm = static_cast<const OP*>(node);
// Kill clang diagnostics bug
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-braces"
......@@ -80,28 +78,32 @@ namespace ngraph
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
auto batchnorm_desc =
mkldnn_emitter->get_batchnorm_forward_desc<OP>(node, true);
auto weights_shape = Shape{2, args[0].get_size()};
auto weights_desc = mkldnn_emitter->build_memory_descriptor(
weights_shape, args[0].get_element_type(), mkldnn::memory::format::nc);
auto results_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto mean_desc = mkldnn_utils::get_output_mkldnn_md(node, 1);
auto variance_desc = mkldnn_utils::get_output_mkldnn_md(node, 2);
auto batchnorm_index =
mkldnn_emitter->build_batchnorm_forward(input_desc,
weights_desc,
results_desc,
mean_desc,
variance_desc,
batchnorm->get_eps_value(),
false,
training,
ops);
// batchnorm forward needs 6 primitives: input, weights, result, mean,
// variance, and batch_normalization_forward.
auto batchnorm_index = mkldnn_emitter->reserve_primitive_space(6);
auto& deps = mkldnn_emitter->get_primitive_deps(batchnorm_index);
auto functor = [&, batchnorm_index, stacked_weights, weight_sizes](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
auto functor = [&,
batchnorm_desc,
weights_desc,
training,
ops,
batchnorm_index,
stacked_weights,
weight_sizes](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_batchnorm_forward(
batchnorm_desc, weights_desc, training, batchnorm_index, ops);
}
memcpy(stacked_weights.get(), arg0_tensor, weight_sizes[0]);
memcpy(
stacked_weights.get() + weight_sizes[0], arg1_tensor, weight_sizes[1]);
......@@ -122,29 +124,32 @@ namespace ngraph
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto batchnorm_desc =
mkldnn_emitter->get_batchnorm_forward_desc<OP>(node, false);
auto weights_shape = Shape{2, args[0].get_size()};
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
auto weights_desc = mkldnn_emitter->build_memory_descriptor(
weights_shape, args[0].get_element_type(), mkldnn::memory::format::nc);
auto mean_desc = mkldnn_utils::get_input_mkldnn_md(node, 3);
auto variance_desc = mkldnn_utils::get_input_mkldnn_md(node, 4);
auto results_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto batchnorm_index =
mkldnn_emitter->build_batchnorm_forward(input_desc,
weights_desc,
results_desc,
mean_desc,
variance_desc,
batchnorm->get_eps_value(),
true,
training,
ops);
// batchnorm forward needs 6 primitives: input, weights, result, mean,
// variance, and batch_normalization_forward.
auto batchnorm_index = mkldnn_emitter->reserve_primitive_space(6);
auto& deps = mkldnn_emitter->get_primitive_deps(batchnorm_index);
auto functor = [&, batchnorm_index, stacked_weights, weight_sizes](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
auto functor = [&,
batchnorm_desc,
weights_desc,
training,
ops,
batchnorm_index,
stacked_weights,
weight_sizes](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_batchnorm_forward(
batchnorm_desc, weights_desc, training, batchnorm_index, ops);
}
memcpy(stacked_weights.get(), arg0_tensor, weight_sizes[0]);
memcpy(
stacked_weights.get() + weight_sizes[0], arg1_tensor, weight_sizes[1]);
......@@ -295,9 +300,6 @@ namespace ngraph
template <>
void Builder::BUILDER_DECL(ngraph::op::BatchNormTrainingBackprop)
{
const ngraph::op::BatchNormTrainingBackprop* batchnorm =
static_cast<const ngraph::op::BatchNormTrainingBackprop*>(node);
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
......@@ -326,34 +328,31 @@ namespace ngraph
std::default_delete<uint8_t[]>());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto batchnorm_desc = mkldnn_emitter->get_batchnorm_backward_desc(node);
auto weights_shape = Shape{2, args[0].get_size()};
auto weights_desc = mkldnn_emitter->build_memory_descriptor(
weights_shape, args[0].get_element_type(), mkldnn::memory::format::nc);
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
auto mean_desc = mkldnn_utils::get_input_mkldnn_md(node, 3);
auto variance_desc = mkldnn_utils::get_input_mkldnn_md(node, 4);
auto delta_desc = mkldnn_utils::get_input_mkldnn_md(node, 5);
auto dinput_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto dweights_desc = mkldnn_emitter->build_memory_descriptor(
weights_shape, args[0].get_element_type(), mkldnn::memory::format::nc);
auto batchnorm_index =
mkldnn_emitter->build_batchnorm_backward(weights_desc,
input_desc,
mean_desc,
variance_desc,
delta_desc,
dinput_desc,
dweights_desc,
batchnorm->get_eps_value());
// batchnorm backward needs 8 primitives: weights, input, mean, variance,
// delta, dinput, dweights, and batch_normalization_backward.
auto batchnorm_index = mkldnn_emitter->reserve_primitive_space(8);
auto& deps = mkldnn_emitter->get_primitive_deps(batchnorm_index);
auto functor = [&,
batchnorm_desc,
weights_desc,
dweights_desc,
batchnorm_index,
stacked_weights,
stacked_dweights,
weight_sizes](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_batchnorm_backward(
batchnorm_desc, weights_desc, dweights_desc, batchnorm_index);
}
memcpy(stacked_weights.get(), arg0_tensor, weight_sizes[0]);
memcpy(stacked_weights.get() + weight_sizes[0], arg1_tensor, weight_sizes[1]);
......
......@@ -43,13 +43,18 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto bounded_relu_index =
mkldnn_emitter->build_bounded_relu(input_desc, result_desc, alpha);
auto bounded_relu_desc = mkldnn_emitter->get_bounded_relu_desc(node);
// BoundedRelu needs 3 primitives: input, result, and eltwise_forward.
auto bounded_relu_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(bounded_relu_index);
auto functor = [&, bounded_relu_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, bounded_relu_desc, bounded_relu_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_bounded_relu(bounded_relu_desc,
bounded_relu_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], input_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, bounded_relu_index);
......
......@@ -92,29 +92,31 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto concat_pd = mkldnn_emitter->get_concat_desc(node, nargs);
std::vector<mkldnn::memory::desc> inputs_data_desc;
for (size_t i = 0; i < args.size(); i++)
for (size_t i = 0; i < nargs; i++)
{
inputs_data_desc.push_back(mkldnn_utils::get_input_mkldnn_md(node, i));
}
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t concat_dim =
(dynamic_cast<const ngraph::op::Concat*>(node))->get_concatenation_axis();
auto concat_index =
mkldnn_emitter->build_concat(inputs_data_desc, result_desc, concat_dim);
// Concat needs number of inputs plus 2 primitives; those two are for result and concat.
auto concat_index = mkldnn_emitter->reserve_primitive_space(nargs + 2);
auto& deps = mkldnn_emitter->get_primitive_deps(concat_index);
auto functor = [&, arg_tensors, nargs, concat_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
for (size_t i = 0; i < nargs; i++)
{
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[i], arg_tensors[i]);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[nargs], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, concat_index);
};
auto functor =
[&, concat_pd, inputs_data_desc, arg_tensors, nargs, concat_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_concat(
concat_pd, inputs_data_desc, concat_index);
}
for (size_t i = 0; i < nargs; i++)
{
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[i], arg_tensors[i]);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[nargs], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, concat_index);
};
functors.emplace_back(functor);
}
......
......@@ -81,11 +81,15 @@ namespace ngraph
mkldnn::memory::format::goihw);
}
size_t reorder_index = mkldnn_emitter->build_reorder(input_desc, result_desc);
// ConvertLayout needs 3 primitives: input, result, and reorder.
size_t reorder_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(reorder_index);
auto functor = [&, reorder_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, input_desc, result_desc, reorder_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_reorder(input_desc, result_desc, reorder_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, reorder_index);
......
......@@ -43,13 +43,17 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto leaky_relu_index =
mkldnn_emitter->build_leaky_relu(input_desc, result_desc, alpha);
auto leaky_relu_desc = mkldnn_emitter->get_leaky_relu_desc(node);
// LeakyRelu needs 3 primitives: input, result, and eltwise_forward.
auto leaky_relu_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(leaky_relu_index);
auto functor = [&, leaky_relu_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, leaky_relu_desc, leaky_relu_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_leaky_relu(leaky_relu_desc, leaky_relu_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], input_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, leaky_relu_index);
......
......@@ -43,19 +43,17 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_data_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto lrn_index =
mkldnn_emitter->build_lrn_forward(input_data_desc,
result_desc,
static_cast<float>(lrn->get_alpha()),
static_cast<float>(lrn->get_beta()),
static_cast<float>(lrn->get_bias()),
static_cast<int>(lrn->get_nsize()));
auto lrn_desc = mkldnn_emitter->get_lrn_forward_desc(node);
// LRN needs 3 primitives: input, result, and lrn_forward.
auto lrn_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(lrn_index);
functor = [&, lrn_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
functor = [&, lrn_desc, lrn_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_lrn_forward(lrn_desc, lrn_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, lrn_index);
......
......@@ -54,10 +54,22 @@ namespace ngraph
auto& dst_iter_tensor = external_function->get_tensor_data(out[1].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto lstm_index = mkldnn_emitter->build_rnn<ngraph::op::Lstm>(node, args, out);
auto lstm_desc =
mkldnn_emitter->get_rnn_forward_desc<ngraph::op::Lstm>(node, args, out);
// Lstm needs 9 primitives: src_layer, src_iter, weights_layer, weights_iter, bias,
// dst_layer, dst_iter, workspace, and rnn_forward.
// It needs a new workspace.
auto lstm_index =
mkldnn_emitter->reserve_primitive_space(9, true /* new workspace */);
auto& deps = mkldnn_emitter->get_primitive_deps(lstm_index);
auto functor = [&, lstm_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
auto functor = [&, lstm_desc, lstm_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_rnn_forward(lstm_desc, lstm_index);
ctx->mkldnn_workspaces = mkldnn_emitter->get_mkldnn_workspaces().data();
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], src_layer_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], src_iter_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], weights_layer_tensor);
......
This diff is collapsed.
......@@ -54,35 +54,26 @@ namespace ngraph
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto scale_const_op = std::dynamic_pointer_cast<ngraph::op::Constant>(
dequantize->get_argument(1));
std::vector<float> scales;
if (scale_const_op == nullptr)
{
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto scales_size = shape_size(args[1].get_shape());
size_t dequantize_index =
mkldnn_emitter->build_dequantization(node, input_desc, result_desc);
// Dequantize needs 3 primitives: input, result, and reorder.
size_t dequantize_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(dequantize_index);
functor = [&, input_desc, result_desc, scales_size, dequantize_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
// Create MKLDNN reorder primitive during the first iteration.
// Assumes the scales don't change for the duration of the graph
if (ctx->first_iteration)
{
mkldnn::primitive_attr attr;
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg1_tensor),
static_cast<float*>(arg1_tensor) + scales_size);
attr.set_output_scales(0, dyn_scales);
attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
auto reorder_desc = mkldnn::reorder::primitive_desc(
{input_desc, executor::global_cpu_engine},
{result_desc, executor::global_cpu_engine},
attr);
*ctx->mkldnn_primitives[dequantize_index] =
mkldnn::reorder(reorder_desc,
*ctx->mkldnn_primitives[deps[0]],
*ctx->mkldnn_primitives[deps[1]]);
mkldnn_emitter->build_quantize_reorder(
input_desc, result_desc, dyn_scales, dequantize_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
......@@ -92,11 +83,19 @@ namespace ngraph
}
else
{
size_t dequantize_index =
mkldnn_emitter->build_dequantization(node, input_desc, result_desc);
std::vector<float> scale = scale_const_op->get_vector<float>();
std::vector<float> scales;
scales.push_back(scale[0]);
size_t dequantize_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(dequantize_index);
functor = [&, dequantize_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
functor = [&, input_desc, result_desc, scales, dequantize_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_quantize_reorder(
input_desc, result_desc, scales, dequantize_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, dequantize_index);
......@@ -243,25 +242,21 @@ namespace ngraph
auto scale_const_op =
std::dynamic_pointer_cast<ngraph::op::Constant>(quantize->get_argument(1));
std::vector<float> scales;
if (scale_const_op == nullptr)
{
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto scales_size = shape_size(args[1].get_shape());
// Dummy value while we wait for the actual values that are provided during
// execution
scales.push_back(1.0f);
size_t quantize_index =
mkldnn_emitter->build_quantize_reorder(input_desc, result_desc, scales);
// Quantize needs 3 primitives: input, result, and reorder.
size_t quantize_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(quantize_index);
auto functor = [&, input_desc, result_desc, scales_size, quantize_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
// Create MKLDNN reorder primitive during the first iteration.
// Assumes the scales don't change for the duration of the graph
if (ctx->first_iteration)
{
mkldnn::primitive_attr attr;
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg1_tensor),
static_cast<float*>(arg1_tensor) + scales_size);
......@@ -271,16 +266,8 @@ namespace ngraph
}
// quantize across first dim (mask=2^0) if dyn_scales is a vector
const int mask = scales_size == 1 ? 0 : 1;
attr.set_output_scales(mask, dyn_scales);
attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
auto reorder_desc = mkldnn::reorder::primitive_desc(
{input_desc, executor::global_cpu_engine},
{result_desc, executor::global_cpu_engine},
attr);
*ctx->mkldnn_primitives[quantize_index] =
mkldnn::reorder(reorder_desc,
*ctx->mkldnn_primitives[deps[0]],
*ctx->mkldnn_primitives[deps[1]]);
mkldnn_emitter->build_quantize_reorder(
input_desc, result_desc, dyn_scales, quantize_index, mask);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
......@@ -291,12 +278,18 @@ namespace ngraph
else
{
auto scale = scale_const_op->get_vector<float>();
std::vector<float> scales;
scales.push_back(1.0 / scale[0]);
size_t quantize_index =
mkldnn_emitter->build_quantize_reorder(input_desc, result_desc, scales);
size_t quantize_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(quantize_index);
auto functor = [&, quantize_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, input_desc, result_desc, scales, quantize_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_quantize_reorder(
input_desc, result_desc, scales, quantize_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, quantize_index);
......
......@@ -35,15 +35,24 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& functors = external_function->get_functors();
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
size_t qavg_pool_index = mkldnn_emitter->build_quantized_avg_pool(node);
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto qavg_pool_desc =
mkldnn_emitter->get_avg_pooling_forward_desc<ngraph::op::QuantizedAvgPool>(
node, false);
// QuantizedAvgPool needs 3 primitives: input, result, and pooling_forward.
size_t qavg_pool_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(qavg_pool_index);
auto functor = [&, qavg_pool_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, qavg_pool_desc, qavg_pool_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_pooling_forward(qavg_pool_desc, qavg_pool_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, qavg_pool_index);
......
......@@ -48,8 +48,7 @@ namespace ngraph
auto conv_desc =
mkldnn_emitter
->get_convolution_forward_desc<ngraph::op::QuantizedConvolution>(
node, args, out);
->get_convolution_forward_desc<ngraph::op::QuantizedConvolution>(node);
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolution>(node);
......@@ -68,7 +67,7 @@ namespace ngraph
// use conv channelwise (dim 1, mask=2^1) if dyn_scales is a vector
const int mask = scales_size == 1 ? 0 : 2;
conv_attr.set_output_scales(mask, dyn_scales);
mkldnn_emitter->convolution_forward<false>(
mkldnn_emitter->build_convolution_forward<false>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
......@@ -101,7 +100,7 @@ namespace ngraph
auto conv_desc =
mkldnn_emitter
->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionRelu>(
node, args, out);
node);
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionRelu>(
......@@ -119,7 +118,7 @@ namespace ngraph
// use conv channelwise (dim 1, mask=2^1) if dyn_scales is a vector
const int mask = scales_size == 1 ? 0 : 2;
conv_attr.set_output_scales(mask, dyn_scales);
mkldnn_emitter->convolution_forward<false>(
mkldnn_emitter->build_convolution_forward<false>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
......@@ -154,7 +153,7 @@ namespace ngraph
auto conv_desc =
mkldnn_emitter
->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionBias>(
node, args, out);
node);
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionBias>(
......@@ -172,7 +171,7 @@ namespace ngraph
// use conv channelwise (dim 1, mask=2^1) if dyn_scales is a vector
const int mask = scales_size == 1 ? 0 : 2;
conv_attr.set_output_scales(mask, dyn_scales);
mkldnn_emitter->convolution_forward<true>(
mkldnn_emitter->build_convolution_forward<true>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
......@@ -213,7 +212,7 @@ namespace ngraph
auto conv_desc =
mkldnn_emitter
->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionBiasAdd>(
node, args, out);
node);
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionBiasAdd>(
......@@ -259,7 +258,7 @@ namespace ngraph
const int mask = scales_size == 1 ? 0 : 2;
conv_attr.set_output_scales(mask, dyn_scales);
conv_attr.set_post_ops(new_pops);
mkldnn_emitter->convolution_forward<true>(
mkldnn_emitter->build_convolution_forward<true>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
......@@ -305,7 +304,7 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto conv_desc = mkldnn_emitter->get_convolution_forward_desc<
ngraph::op::QuantizedConvolutionBiasSignedAdd>(node, args, out);
ngraph::op::QuantizedConvolutionBiasSignedAdd>(node);
auto conv_attr = mkldnn_emitter->get_convolution_forward_attr<
ngraph::op::QuantizedConvolutionBiasSignedAdd>(node);
size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
......@@ -349,7 +348,7 @@ namespace ngraph
// use conv channelwise (dim 1, mask=2^1) if dyn_scales is a vector
const int mask = scales_size == 1 ? 0 : 2;
conv_attr.set_output_scales(mask, dyn_scales);
mkldnn_emitter->convolution_forward<true>(
mkldnn_emitter->build_convolution_forward<true>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
......
......@@ -35,16 +35,24 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& functors = external_function->get_functors();
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
size_t qmax_pool_index = mkldnn_emitter->build_quantized_max_pool(node);
auto qmax_pool_desc =
mkldnn_emitter->get_max_pooling_forward_desc<ngraph::op::QuantizedMaxPool>(
node, false);
// QuantizedMaxPool needs 3 primitives: input, result, and pooling_forward.
size_t qmax_pool_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(qmax_pool_index);
auto functor = [&, qmax_pool_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, qmax_pool_desc, qmax_pool_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_pooling_forward(qmax_pool_desc, qmax_pool_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, qmax_pool_index);
......
......@@ -40,15 +40,17 @@ namespace ngraph
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t relu_index = mkldnn_emitter->build_relu_forward(input_desc, result_desc);
auto relu_desc = mkldnn_emitter->get_relu_forward_desc(node);
// Relu needs 3 primitives: input, result, and eltwise_forward.
size_t relu_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(relu_index);
auto functor = [&, relu_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, relu_desc, relu_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_relu_forward(relu_desc, relu_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, relu_index);
......@@ -74,16 +76,18 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto delta_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t relu_index =
mkldnn_emitter->build_relu_backward(input_desc, delta_desc, result_desc);
auto bwd_desc = mkldnn_emitter->get_relu_backward_desc(node);
auto fwd_desc = mkldnn_emitter->get_relu_forward_desc(node);
// ReluBackprop needs 4 primitives: input, delta, result, and eltwise_backward.
size_t relu_index = mkldnn_emitter->reserve_primitive_space(4);
auto& deps = mkldnn_emitter->get_primitive_deps(relu_index);
auto functor = [&, relu_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, bwd_desc, fwd_desc, relu_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_relu_backward(bwd_desc, fwd_desc, relu_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_fwd_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], delta_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
......
......@@ -49,9 +49,22 @@ namespace ngraph
auto& dst_iter_tensor = external_function->get_tensor_data(out[1].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto rnn_index = mkldnn_emitter->build_rnn<ngraph::op::Rnn>(node, args, out);
auto rnn_desc =
mkldnn_emitter->get_rnn_forward_desc<ngraph::op::Rnn>(node, args, out);
// Rnn needs 9 primitives: src_layer, src_iter, weights_layer, weights_iter, bias,
// dst_layer, dst_iter, workspace, and rnn_forward.
// It needs a new workspace.
auto rnn_index =
mkldnn_emitter->reserve_primitive_space(9, true /* new workspace */);
auto& deps = mkldnn_emitter->get_primitive_deps(rnn_index);
auto functor = [&, rnn_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
auto functor = [&, rnn_desc, rnn_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_rnn_forward(rnn_desc, rnn_index);
ctx->mkldnn_workspaces = mkldnn_emitter->get_mkldnn_workspaces().data();
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], src_layer_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], src_iter_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], weights_layer_tensor);
......
......@@ -42,15 +42,17 @@ namespace ngraph
auto out_shape = out[0].get_shape();
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto out_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto sigmoid_index = mkldnn_emitter->build_sigmoid_forward(input_desc, out_desc);
auto sigmoid_desc = mkldnn_emitter->get_sigmoid_forward_desc(node, false);
// Sigmoid needs 3 primitives: input, result, and eltwise_forward.
auto sigmoid_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(sigmoid_index);
auto functor = [&, sigmoid_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, sigmoid_desc, sigmoid_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_sigmoid_forward(sigmoid_desc, sigmoid_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, sigmoid_index);
......@@ -72,17 +74,18 @@ namespace ngraph
auto out_shape = out[0].get_shape();
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto delta_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
auto out_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t sigmoid_index =
mkldnn_emitter->build_sigmoid_backward(input_desc, delta_desc, out_desc);
auto fwd_desc = mkldnn_emitter->get_sigmoid_forward_desc(node, true);
auto bwd_desc = mkldnn_emitter->get_sigmoid_backward_desc(node);
// SigmoidBackprop needs 4 primitives: input, delta, result, and eltwise_backward.
size_t sigmoid_index = mkldnn_emitter->reserve_primitive_space(4);
auto& deps = mkldnn_emitter->get_primitive_deps(sigmoid_index);
auto functor = [&, sigmoid_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, bwd_desc, fwd_desc, sigmoid_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_sigmoid_backward(bwd_desc, fwd_desc, sigmoid_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
......
......@@ -84,17 +84,22 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto slice_index = mkldnn_emitter->build_slice(
input_desc, result_desc, lower_bounds, out_shape);
// Slice needs 3 primitives: input, result, and reorder.
auto slice_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(slice_index);
auto functor = [&, slice_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, slice_index);
};
auto functor =
[&, input_desc, result_desc, lower_bounds, out_shape, slice_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_slice(
input_desc, result_desc, lower_bounds, out_shape, slice_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, slice_index);
};
functors.emplace_back(functor);
}
......
......@@ -46,23 +46,18 @@ namespace ngraph
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
if (axes.size() != 1)
{
throw ngraph_error("MKLDNN supports softmax only across single axis");
}
int softmax_axis = static_cast<int>(*(axes.begin()));
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t softmax_index = mkldnn_emitter->build_softmax_forward(
input_desc, result_desc, softmax_axis);
auto softmax_desc = mkldnn_emitter->get_softmax_forward_desc(node);
// Softmax needs 3 primitives: input, result, and softmax_forward.
size_t softmax_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(softmax_index);
auto functor = [&, softmax_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
auto functor = [&, softmax_desc, softmax_index](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_softmax_forward(softmax_desc, softmax_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, softmax_index);
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment