Commit 73da681a authored by Pruthvi, committed by Scott Cyphers

Pruthvi/fix rnn precision (#1874)

* - Added reorder support for rnn weights_layer/iter

* i) fixed compilation issues ii) working but still observing a precision error

* i) fixed failing rnn unit test for DEX ii) refactored workspace in RNN mkldnn emitter

* i) added support for src reorder to TNC from NTC

* reorder support for rnn output from NTC to TNC (see the layout sketch after the commit metadata below)

* - added support for rnn weight reorder ldgoi -> ldigo
- code refactor for lstm/rnn kernel in mkldnn emitter

* - refactor rnn mkldnn kernel, change variable names

* fix RNN codegen kernel

* disable layer rnn fusion pass, to test CI

* method to validate recurrent rnn inputs

* add correlated matches for Recurrent RNN PM

* - simplify reorder logic for rnn_weights
- fix graph pattern for fusing rnn cell across time steps

* do weights reorders in rnn timesteps fusion

* refactored LSTM graph pass

* - Bug fix for finding the lstm inputs deterministically
- Refactored LSTM graph pass to single pass
- made changes to LSTM RNN time step fusion graph pass

* - use replace_node instead of replace_output in Lstm_step_wise fusion graph pass

* fix compilation error

* Fix GNMT rnn fusion

* check if the node is in use before replacing in RNN graph passes

*  i) fix style ii) fix topo sort issue in RNN graph pass

* style fix

* fix bug in simplify_concat pass

* replaces Lstm1 -> {GOE1, GOE2} -> {Slice1, Slice2} -> Concat -> Lstm2 with Lstm1 -> Lstm2

* cse for convert layout

* addressed PR comments

* - optimization pass to remove  Lstm1 -> {GOE1, GOE2} -> {Slice1, Slice2} -> Lstm2
- conditional fusing of LSTM cells only for the decoder

* made changes to multi layer RNN fusion callback

* fix asserts in RNN op

* - added support to fuse layers when slc=dlc for RNN cells
- bug fix on the sanity checks for RNN Op

* - support RNN layer fusion till slc = dlc
- bug fixes in multi layer rnn fusion call back

* capture reshape in the RNN weights

* Addressed PR comments

* - added comments in multi layer PM call back
- fuse only if slc == dlc across layers

* restore deleted 3_lstm_cell_forward.json file

* fix typo

* fix failing unit tests

* When processing in place slice, do not change the offset of the slice node if the argument pointer comes from function input.

* Address PR feedback: process in place slice after propagating in place input.

* Set INTERMEDIATE role before propagating in place input.

* Do not add temporaries to the variable name map before propagating in place input in codegen.

* Fix a bug in codegen.

* Fix a bug in codegen slice.

* reenable disabled rnn unit test

* fix compiler error

* - bug fix in the slicing logic for the layer fused rnn cell
- fix failing rnn unit test

* - Addressed PR comments
- removed redundant checks from the rnn graph pass
- simplified rnn call back replace node logic

* - added new multilayer rnn *.json file
- fix test case

* [PRIVATE BRANCH] Style fixes (#2080)

* Style fixes

* change order of lstm gates

* [PRIVATE BRANCH] Jbobba/rnn fusion review (#2113)

* Style fixes for single-layer RNN fusion

* Style fixes to multi-layer RNN

* style fix

* disable GPU test
parent 86b783c6
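Editorial note: several of the commit bullets above mention reordering the RNN activations between the batch-major NTC layout and the time-major TNC layout that the MKLDNN kernels expect. The snippet below is only a minimal illustration of what such a reorder does; it is not the emitter code from this commit, and the function and variable names are made up for the example.

// Minimal sketch (not part of this commit): reorder a batch-major NTC buffer
// of shape [N, T, C] into a time-major TNC buffer of shape [T, N, C].
// All names here are illustrative only.
#include <cstddef>
#include <vector>

std::vector<float> ntc_to_tnc(const std::vector<float>& src, size_t N, size_t T, size_t C)
{
    std::vector<float> dst(N * T * C);
    for (size_t n = 0; n < N; ++n)
    {
        for (size_t t = 0; t < T; ++t)
        {
            for (size_t c = 0; c < C; ++c)
            {
                // NTC index: (n * T + t) * C + c  ->  TNC index: (t * N + n) * C + c
                dst[(t * N + n) * C + c] = src[(n * T + t) * C + c];
            }
        }
    }
    return dst;
}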
......@@ -140,7 +140,7 @@ static bool simplify_concat(std::shared_ptr<Node> n)
NGRAPH_DEBUG << "setting branch_tip to " << branch_tip->get_name();
}
if (slice->get_users().size() > 1)
if (slice->get_users(true).size() > 1)
{
NGRAPH_DEBUG << slice->get_name() << " has more than one user";
return false;
......@@ -162,7 +162,7 @@ static bool simplify_concat(std::shared_ptr<Node> n)
return false;
}
if (rcarg->get_users().size() > 1)
if (rcarg->get_users(true).size() > 1)
{
NGRAPH_DEBUG << rcarg->get_name() << " has more than one user";
return false;
......@@ -173,7 +173,7 @@ static bool simplify_concat(std::shared_ptr<Node> n)
auto concat = std::static_pointer_cast<op::Concat>(n);
size_t concat_axis = concat->get_concatenation_axis();
auto slice_shape = branch_tip->get_users().at(0)->get_shape();
auto slice_shape = branch_tip->get_users(true).at(0)->get_shape();
size_t slice_axis = std::numeric_limits<size_t>::max();
auto btip_shape = branch_tip->get_shape();
......@@ -200,6 +200,10 @@ static bool simplify_concat(std::shared_ptr<Node> n)
}
}
if (slice_axis == std::numeric_limits<size_t>::max())
{
return false;
}
auto replacement = branch_tip;
if (btip_shape != n->get_shape())
{
......
......@@ -101,7 +101,6 @@ set(SRC
op/conv_add.cpp
pass/cpu_assignment.cpp
pass/cpu_collapse_dims.cpp
pass/cpu_concat_inputs.cpp
pass/cpu_fusion.cpp
pass/cpu_horizontal_fusion.cpp
pass/cpu_layout.cpp
......
......@@ -37,9 +37,7 @@ namespace ngraph
"Lstm is supported only through MKLDNN and doesnt have reference "
"INTERPRETER implementation");
}
const ngraph::op::Lstm* lstm_node = static_cast<const ngraph::op::Lstm*>(node);
if (args.size() != 5 || !lstm_node->get_fused_inputs())
if (args.size() != 5)
{
throw ngraph_error(
"Lstm op doesnt have the required number of inputs to create MKLDNN "
......
......@@ -521,8 +521,7 @@ namespace ngraph
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Lstm)
{
const ngraph::op::Lstm* lstm_node = static_cast<const ngraph::op::Lstm*>(node);
if (args.size() != 5 || !lstm_node->get_fused_inputs())
if (args.size() != 5)
{
throw ngraph_error(
"Lstm op doesnt have the required number of inputs to emit MKLDNN kernel");
......
......@@ -163,7 +163,6 @@
#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
#include "ngraph/runtime/cpu/pass/cpu_assignment.hpp"
#include "ngraph/runtime/cpu/pass/cpu_collapse_dims.hpp"
#include "ngraph/runtime/cpu/pass/cpu_concat_inputs.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_horizontal_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
......@@ -1076,11 +1075,10 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(ngraph::pass::Ma
REGISTER_KNOBBED_PASS(LikeReplacement, true, ngraph::pass);
REGISTER_KNOBBED_PASS(NopElimination, true, ngraph::pass);
REGISTER_KNOBBED_PASS(ZeroDimTensorElimination, true, ngraph::pass);
REGISTER_KNOBBED_PASS(LSTMFusion, false, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(RNNFusion, false, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(LSTMFusion, true, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(RNNFusion, true, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(AlgebraicSimplification, true, ngraph::pass);
REGISTER_KNOBBED_PASS(MultiLayerRNNFusion, false, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(ConcatInputs, false, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(MultiLayerRNNFusion, true, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(CPURnnMatFusion, true, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(CPUBatchFusion, true, runtime::cpu::pass);
REGISTER_KNOBBED_PASS(CPUReshapeSinking, false, runtime::cpu::pass);
......
......@@ -23,102 +23,12 @@ using namespace ngraph;
shared_ptr<Node> op::Lstm::copy_with_new_args(const NodeVector& new_args) const
{
if (!m_fused_inputs)
{
if (new_args.size() != 7)
{
throw ngraph_error("Incorrect number of new arguments");
}
return make_shared<Lstm>(new_args.at(0),
new_args.at(1),
new_args.at(2),
new_args.at(3),
new_args.at(4),
new_args.at(5),
new_args.at(6));
}
else
{
if (new_args.size() != 5 && m_fused_inputs)
if (new_args.size() != 5)
{
throw ngraph_error("Incorrect number of new arguments");
}
return make_shared<Lstm>(
new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), new_args.at(4));
}
}
op::Lstm::Lstm(std::shared_ptr<Node> input_xt_1,
std::shared_ptr<Node> i2h_weights,
std::shared_ptr<Node> hidden_state_ht_1,
std::shared_ptr<Node> h2h_weights,
std::shared_ptr<Node> i2h_bias,
std::shared_ptr<Node> h2h_bias,
std::shared_ptr<Node> cell_state_ct_1)
: Op("Lstm",
check_single_output_args({input_xt_1,
i2h_weights,
hidden_state_ht_1,
h2h_weights,
i2h_bias,
h2h_bias,
cell_state_ct_1}))
, m_output_tensor_shape(hidden_state_ht_1->get_shape())
, m_output_cell_shape(cell_state_ct_1->get_shape())
, m_num_timesteps(1)
, m_num_gates_per_cell(4)
, m_src_sequence_length(1)
, m_src_layer_feature_size(input_xt_1->get_shape()[1])
, m_src_iter_feature_size(hidden_state_ht_1->get_shape()[1])
, m_num_cell_states(2)
, m_direction(1)
, m_num_fused_layers(1)
, m_fused_inputs(false)
{
constructor_validate_and_infer_types();
if (input_xt_1->get_shape().size() != i2h_weights->get_shape().size())
{
throw ngraph_error("input_xt_1 and i2h weights size dont match");
}
if (hidden_state_ht_1->get_shape().size() != h2h_weights->get_shape().size())
{
throw ngraph_error("hidden_state_ht_1 and h2h weights size dont match");
}
if (input_xt_1->get_shape().size() == 2)
{
m_batch_size = input_xt_1->get_shape()[0];
}
else
{
throw ngraph_error("input_xt_1 doesnt have a rank 2");
}
if (shape_size(input_xt_1->get_shape()) !=
m_src_sequence_length * m_batch_size * m_src_layer_feature_size)
{
throw ngraph_error("input_xt_1 size is not equal t*n*c");
}
if (i2h_bias->get_shape()[0] != i2h_weights->get_shape()[0] ||
h2h_bias->get_shape()[0] != h2h_weights->get_shape()[0])
{
throw ngraph_error("bias and weights_shape are not compatible");
}
auto et = input_xt_1->get_element_type();
for (auto& lstm_input : get_arguments())
{
if (lstm_input->get_element_type() != et)
{
throw ngraph_error("all rnn inputs must have the same element type");
}
}
set_output_size(2);
set_output_type(0, hidden_state_ht_1->get_element_type(), hidden_state_ht_1->get_shape());
set_output_type(1, cell_state_ct_1->get_element_type(), cell_state_ct_1->get_shape());
}
op::Lstm::Lstm(std::shared_ptr<Node> src_layer,
......@@ -137,7 +47,6 @@ op::Lstm::Lstm(std::shared_ptr<Node> src_layer,
, m_num_cell_states(2)
, m_direction(1)
, m_num_fused_layers(1)
, m_fused_inputs(true)
{
constructor_validate_and_infer_types();
......@@ -166,8 +75,8 @@ op::Lstm::Lstm(std::shared_ptr<Node> src_layer,
throw ngraph_error("src_layer size is not equal t*n*c");
}
if (bias->get_shape()[0] != weights_layer->get_shape()[0] ||
bias->get_shape()[0] != weights_iter->get_shape()[0])
if (bias->get_shape()[0] != weights_layer->get_shape()[1] ||
bias->get_shape()[0] != weights_iter->get_shape()[1])
{
throw ngraph_error("bias and weights_shape are not compatible");
}
......
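Editorial note: the bias/weights compatibility check in the hunk above now compares the bias length against the second dimension of the weight matrices, which lines up with the weight reorder described in the commit message (ldgoi -> ldigo, i.e. storing the fused weights as [input_feature_size, 4 * hidden_size] so that bias->get_shape()[0] matches weights->get_shape()[1]). Below is a rough, hypothetical sketch of that kind of 2-D weight transpose; it is not the MKLDNN reorder primitive the emitter actually uses, and the names are invented for the example.

// Hypothetical sketch only: transpose a fused LSTM weight matrix stored as
// [4 * hidden, input] (ldgoi-like) into [input, 4 * hidden] (ldigo-like),
// so its second dimension matches the bias length checked above.
#include <cstddef>
#include <vector>

std::vector<float> transpose_weights(const std::vector<float>& w, size_t gates_times_hidden, size_t input)
{
    std::vector<float> out(input * gates_times_hidden);
    for (size_t r = 0; r < gates_times_hidden; ++r)
    {
        for (size_t c = 0; c < input; ++c)
        {
            out[c * gates_times_hidden + r] = w[r * input + c];
        }
    }
    return out;
}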
......@@ -26,29 +26,6 @@ namespace ngraph
class Lstm : public Op
{
public:
// INPUTS:
// [0] - xt, input tensor of layout TNC, Shape{sequence length*batch_size, feature_size}
// [1] - initializer for the input weights matrix, used for the linear transformation of the inputs.
// [2] - ht_1, hidden state of shape (batch_size, feature_size)
// [3] - initializer for the recurrent weights matrix, used for the linear transformation of the recurrent state.
// [4] - Initializer for the bias vector w.r.to inputs.
// [5] - Initializer for the bias vector w.r.to hidden state
// [6] - ct_1, cell state of shape (batch_size, feature_size)
// OUTPUT VALUE: A tuple with the following structure:
// [0] - ht, output tensor with shape (sequence_length*batch_size, num_hidden) .
// [1] - ct, output recurrent state tensor with the same shape as cell state
// This version of the LSTM op is only used to simplify recurrent RNN cell(LSTM) fusion across
// horizontal time steps. This doesnt have mkldnn emitter code.
Lstm(std::shared_ptr<Node> input_xt_1,
std::shared_ptr<Node> i2h_weights,
std::shared_ptr<Node> hidden_state_ht_1,
std::shared_ptr<Node> h2h_weights,
std::shared_ptr<Node> i2h_bias,
std::shared_ptr<Node> h2h_bias,
std::shared_ptr<Node> cell_state_ct_1);
// INPUTS:
// [0] - {Xt} input tensor of layout TNC, Shape{sequence length*batch_size, feature_size}
// [1] - recurrent state tensors {ht_1 | ct_1} of Shape{sequence length*batch_size, feature_size}
......@@ -78,7 +55,6 @@ namespace ngraph
size_t get_num_cell_states() const { return m_num_cell_states; }
size_t get_direction() const { return m_direction; }
size_t get_num_fused_layers() const { return m_num_fused_layers; }
size_t get_fused_inputs() const { return m_fused_inputs; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
......@@ -94,7 +70,6 @@ namespace ngraph
size_t m_num_cell_states;
size_t m_direction;
size_t m_num_fused_layers;
bool m_fused_inputs; // True if node gets fused inputs/weights
};
}
}
......@@ -35,8 +35,6 @@ shared_ptr<Node> op::Rnn::copy_with_new_args(const NodeVector& new_args) const
m_num_timesteps,
m_num_gates_per_cell,
m_src_sequence_length,
m_src_layer_feature_size,
m_src_iter_feature_size,
m_num_cell_states,
m_direction,
m_num_fused_layers);
......@@ -50,8 +48,6 @@ op::Rnn::Rnn(std::shared_ptr<Node> src_layer,
size_t num_timesteps,
size_t num_gates_per_cell,
size_t src_sequence_length,
size_t src_layer_feature_size,
size_t src_iter_feature_size,
size_t num_cell_states,
size_t direction,
size_t num_fused_layers)
......@@ -59,14 +55,11 @@ op::Rnn::Rnn(std::shared_ptr<Node> src_layer,
, m_num_timesteps(num_timesteps)
, m_num_gates_per_cell(num_gates_per_cell)
, m_src_sequence_length(src_sequence_length)
, m_src_layer_feature_size(src_layer_feature_size)
, m_src_iter_feature_size(src_iter_feature_size)
, m_num_cell_states(num_cell_states)
, m_direction(direction)
, m_num_fused_layers(num_fused_layers)
{
constructor_validate_and_infer_types();
if (src_layer->get_shape().size() != weights_layer->get_shape().size())
{
throw ngraph_error("src_layer and i2h weights size dont match");
......@@ -86,14 +79,19 @@ op::Rnn::Rnn(std::shared_ptr<Node> src_layer,
throw ngraph_error("src_layer doesnt have a rank 2");
}
m_dst_iter_feature_size = weights_iter->get_shape()[1] / (m_num_gates_per_cell);
m_dst_layer_feature_size = weights_layer->get_shape()[1] / (m_num_gates_per_cell);
m_src_iter_feature_size = weights_iter->get_shape()[0] / (m_direction * m_num_fused_layers);
m_src_layer_feature_size = weights_layer->get_shape()[0] / (m_direction * m_num_fused_layers);
if (shape_size(src_layer->get_shape()) !=
m_src_sequence_length * m_batch_size * m_src_layer_feature_size)
{
throw ngraph_error("src_layer size is not equal t*n*c");
}
if (bias->get_shape()[0] != weights_layer->get_shape()[0] ||
bias->get_shape()[0] != weights_iter->get_shape()[0])
if ((bias->get_shape()[0] / m_num_fused_layers) != (weights_layer->get_shape()[1]) ||
(bias->get_shape()[0] / m_num_fused_layers) != (weights_iter->get_shape()[1]))
{
throw ngraph_error("bias and weights_shape are not compatible");
}
......
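Editorial note: with src_layer_feature_size and src_iter_feature_size removed from the constructor arguments, the Rnn op now derives all feature sizes from the weight shapes. The following is a small self-contained worked example of that arithmetic, using assumed shapes that mirror the unit tests (weights_layer and weights_iter of Shape{100, 400}, bias of Shape{400}, 4 gates, single direction, single fused layer); it is illustrative only, not code from this commit.

// Worked example with assumed shapes; mirrors the derivations in the hunk above.
#include <cassert>
#include <cstddef>

int main()
{
    const size_t num_gates_per_cell = 4;
    const size_t direction = 1;
    const size_t num_fused_layers = 1;
    const size_t weights_layer_shape[2] = {100, 400};
    const size_t weights_iter_shape[2] = {100, 400};
    const size_t bias_length = 400;

    size_t dst_iter_feature_size = weights_iter_shape[1] / num_gates_per_cell;               // 100
    size_t dst_layer_feature_size = weights_layer_shape[1] / num_gates_per_cell;             // 100
    size_t src_iter_feature_size = weights_iter_shape[0] / (direction * num_fused_layers);   // 100
    size_t src_layer_feature_size = weights_layer_shape[0] / (direction * num_fused_layers); // 100

    // The compatibility check added in this commit:
    assert(bias_length / num_fused_layers == weights_layer_shape[1]);
    assert(bias_length / num_fused_layers == weights_iter_shape[1]);

    (void)dst_iter_feature_size;
    (void)dst_layer_feature_size;
    (void)src_iter_feature_size;
    (void)src_layer_feature_size;
    return 0;
}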
......@@ -54,8 +54,6 @@ namespace ngraph
size_t num_timesteps,
size_t num_gates_per_cell,
size_t src_sequence_length,
size_t src_layer_feature_size,
size_t src_iter_feature_size,
size_t num_cell_states,
size_t direction,
size_t num_fused_layers);
......@@ -67,6 +65,8 @@ namespace ngraph
size_t get_batch_size() const { return m_batch_size; }
size_t get_src_layer_feature_size() const { return m_src_layer_feature_size; }
size_t get_src_iter_feature_size() const { return m_src_iter_feature_size; }
size_t get_dst_layer_feature_size() const { return m_dst_layer_feature_size; }
size_t get_dst_iter_feature_size() const { return m_dst_iter_feature_size; }
size_t get_num_cell_states() const { return m_num_cell_states; }
size_t get_direction() const { return m_direction; }
size_t get_num_fused_layers() const { return m_num_fused_layers; }
......@@ -77,6 +77,8 @@ namespace ngraph
size_t m_batch_size;
size_t m_src_layer_feature_size;
size_t m_src_iter_feature_size;
size_t m_dst_layer_feature_size;
size_t m_dst_iter_feature_size;
size_t m_num_cell_states;
size_t m_direction;
size_t m_num_fused_layers;
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "cpu_concat_inputs.hpp"
#include <algorithm>
#include <iostream>
#include <numeric>
#include <typeindex>
#include <typeinfo>
#include <unordered_set>
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/sum.hpp"
#include "ngraph/op/tanh.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/pattern/op/skip.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
using namespace ngraph;
void ngraph::runtime::cpu::pass::ConcatInputs::concat_lstm_inputs()
{
auto ht_1 = std::make_shared<pattern::op::Label>(element::f32, Shape{32, 100});
auto weights_h2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400, 100});
auto xt = std::make_shared<pattern::op::Label>(element::f32, Shape{32, 100});
auto weights_i2h = std::make_shared<pattern::op::Label>(element::f32, Shape{400, 100});
auto bias1 = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
auto bias2 = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
auto ct_1 = std::make_shared<pattern::op::Label>(element::f32, Shape{32, 100});
auto lstm = std::make_shared<op::Lstm>(xt, weights_i2h, ht_1, weights_h2h, bias1, bias2, ct_1);
auto goe = std::make_shared<op::GetOutputElement>(lstm, 0);
auto lstm_node_label = std::make_shared<pattern::op::Label>(goe, nullptr, NodeVector{goe});
pattern::graph_rewrite_callback callback =
[lstm_node_label, xt, weights_h2h, ht_1, weights_i2h, bias1, bias2, ct_1](
pattern::Matcher& m) {
auto pattern_map = m.get_pattern_map();
NGRAPH_DEBUG << " In LSTM MKLDNN callback";
if (m.get_match_root()->get_element_type() != element::f32)
{
NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
<< " type is not float!";
return false;
}
std::shared_ptr<Node> src_layer = pattern_map[xt];
std::shared_ptr<Node> src_iter =
std::make_shared<op::Concat>(NodeVector{pattern_map[ht_1], pattern_map[ct_1]}, 0);
std::shared_ptr<Node> bias =
std::make_shared<op::Add>(pattern_map[bias1], pattern_map[bias2]);
auto lstm_node = std::static_pointer_cast<op::Lstm>(
pattern_map[lstm_node_label]->get_arguments()[0]);
auto batch_size = lstm_node->get_batch_size();
auto feature_size = lstm_node->get_src_iter_feature_size();
auto lstm_mkldnn_node = std::make_shared<op::Lstm>(
src_layer, src_iter, pattern_map[weights_i2h], pattern_map[weights_h2h], bias);
auto lstm_ht_out = std::make_shared<op::GetOutputElement>(lstm_mkldnn_node, 0);
auto lstm_ht_ct_out = std::make_shared<op::GetOutputElement>(lstm_mkldnn_node, 1);
// dst_iter of lstm mkldnn output holds the results of both recurrent state
// tensor outputs. we need to slice the ct.
auto ht_slice = std::make_shared<op::Slice>(
lstm_ht_ct_out, Coordinate{0, 0}, Coordinate{batch_size, feature_size});
auto ct_slice = std::make_shared<op::Slice>(lstm_ht_ct_out,
Coordinate{batch_size, 0},
Coordinate{(2 * batch_size), feature_size});
// now go through the GOE's and replace the slices (ht)
std::set<std::shared_ptr<ngraph::Node>> lstm_outputs;
for (auto& goes : lstm_node->get_outputs().at(0).get_inputs())
{
auto goe_node = std::dynamic_pointer_cast<op::GetOutputElement>(goes->get_node());
NGRAPH_ASSERT(goe_node);
lstm_outputs.insert(goes->get_node());
// first output node of lstm
if (goe_node->get_n() == 0)
{
NGRAPH_DEBUG << "Replacing 1st output Lstm node " << goe_node->get_name()
<< " with " << lstm_ht_out->get_name();
if (!goe_node->get_users().empty())
{
ngraph::replace_node(goe_node, lstm_ht_out);
}
}
else if (goe_node->get_n() == 1)
{
for (auto& goe_ct_user : goe_node->get_users())
{
for (size_t i = 0; i < goe_ct_user->get_input_size(); i++)
{
if (goe_ct_user->get_argument(i) == goe_node)
{
goe_ct_user->get_inputs().at(i).replace_output(
ct_slice->get_outputs().at(0));
}
}
}
NGRAPH_DEBUG << "Replacing 2nd output Lstm node " << goe_node->get_name()
<< " with " << ct_slice->get_name();
}
}
if (lstm_outputs.find(m.get_match_root()) == lstm_outputs.end())
{
throw ngraph_error(
"Pattern matcher error, matched root node should be one of the LSTM outputs");
}
return true;
};
auto m = std::make_shared<pattern::Matcher>(lstm_node_label, callback);
this->add_matcher(m);
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/graph_rewrite.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{
class ConcatInputs;
}
}
}
}
class ngraph::runtime::cpu::pass::ConcatInputs : public ngraph::pass::GraphRewrite
{
public:
ConcatInputs()
: GraphRewrite()
{
concat_lstm_inputs();
}
private:
void concat_lstm_inputs();
};
......@@ -61,6 +61,7 @@
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
#include "ngraph/util.hpp"
......@@ -1753,3 +1754,45 @@ void ngraph::runtime::cpu::pass::CPUFusion::
auto m = std::make_shared<ngraph::pattern::Matcher>(prelu, callback);
this->add_matcher(m);
}
void ngraph::runtime::cpu::pass::CPUFusion::construct_fuse_lstm_recurrent_state()
{
auto src_layer_label = std::make_shared<pattern::op::Label>(element::f32, Shape{30, 100});
auto src_iter_label = std::make_shared<pattern::op::Label>(element::f32, Shape{20, 100});
auto weights_layer_label = std::make_shared<pattern::op::Label>(element::f32, Shape{100, 400});
auto weights_iter_label = std::make_shared<pattern::op::Label>(element::f32, Shape{100, 400});
auto bias_label = std::make_shared<pattern::op::Label>(element::f32, Shape{400});
auto lstm1 = std::make_shared<op::Lstm>(
src_layer_label, src_iter_label, weights_layer_label, weights_iter_label, bias_label);
auto lstm1_goe0 = std::make_shared<op::GetOutputElement>(lstm1, 0);
auto lstm1_goe1 = std::make_shared<op::GetOutputElement>(lstm1, 1);
auto lstm1_goe0_label =
std::make_shared<pattern::op::Label>(lstm1_goe0, nullptr, NodeVector{lstm1_goe0});
auto lstm1_goe1_label =
std::make_shared<pattern::op::Label>(lstm1_goe1, nullptr, NodeVector{lstm1_goe1});
auto lstm1_goe0_slice =
std::make_shared<op::Slice>(lstm1_goe0_label, Coordinate{0, 0}, Coordinate{10, 100});
auto lstm1_goe1_slice =
std::make_shared<op::Slice>(lstm1_goe1_label, Coordinate{10, 0}, Coordinate{20, 100});
auto concat = std::make_shared<op::Concat>(NodeVector{lstm1_goe0_slice, lstm1_goe1_slice}, 0);
auto concat_label = std::make_shared<pattern::op::Label>(concat, nullptr, NodeVector{concat});
ngraph::pattern::graph_rewrite_callback callback =
[lstm1, lstm1_goe0_label, concat_label, lstm1_goe1_label](pattern::Matcher& m) {
NGRAPH_DEBUG << "In Lstm concat fusion" << m.get_match_root()->get_name();
auto pattern_map = m.get_pattern_map();
if (pattern_map[lstm1_goe0_label]->get_arguments()[0] !=
pattern_map[lstm1_goe1_label]->get_arguments()[0])
{
return false;
}
// we can replace the concat lstm_goe_1 which had both recurrent state tensor
ngraph::replace_node(pattern_map[concat_label], pattern_map[lstm1_goe1_label]);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(concat_label, callback);
this->add_matcher(m);
}
......@@ -79,6 +79,7 @@ public:
// construct_conv_add() should always be after construct_conv_bias()
construct_conv_add();
construct_conv_add_relu();
construct_fuse_lstm_recurrent_state();
}
}
......@@ -106,4 +107,5 @@ private:
void construct_conv_bias_affine_folding();
void construct_groupconv_batchnorm_global_stats_folding();
void construct_groupconv_batchnorm_global_stats_folding_relu();
void construct_fuse_lstm_recurrent_state();
};
......@@ -62,7 +62,6 @@
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
#include "ngraph/runtime/cpu/pass/cpu_concat_inputs.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_mat_fusion.hpp"
......@@ -2121,20 +2120,16 @@ TEST(cpu_fusion, group_convolution)
ASSERT_EQ(rv, erv);
}
//TODO(Pruthvi) enable this test after MKLDNN RNN bug is fixed
#if 0
TEST(cpu_fusion, rnn_fprop_1_lstm_cell)
{
auto src_layer = make_shared<op::Parameter>(element::f32, Shape{10, 100});
auto src_iter = make_shared<op::Parameter>(element::f32, Shape{20, 100});
auto weights_layer = make_shared<op::Parameter>(element::f32, Shape{400, 100});
auto weights_iter = make_shared<op::Parameter>(element::f32, Shape{400, 100});
auto weights_layer = make_shared<op::Parameter>(element::f32, Shape{100, 400});
auto weights_iter = make_shared<op::Parameter>(element::f32, Shape{100, 400});
auto biases = make_shared<op::Parameter>(element::f32, Shape{400});
const int number_of_timesteps = 1;
const int number_of_gates_per_cell = 4;
const int src_seq_length = 1;
const int src_layer_feature_size = 100;
const int feature_size = 100;
const int num_rnn_cell_states = 2;
const int rnn_direction = 1;
const int num_of_rnn_fused_layer = 1;
......@@ -2146,8 +2141,6 @@ TEST(cpu_fusion, rnn_fprop_1_lstm_cell)
number_of_timesteps,
number_of_gates_per_cell,
src_seq_length,
src_layer_feature_size,
feature_size,
num_rnn_cell_states,
rnn_direction,
num_of_rnn_fused_layer);
......@@ -2170,8 +2163,7 @@ TEST(cpu_fusion, rnn_fprop_1_lstm_cell)
shared_ptr<runtime::Tensor> biases_t =
backend->create_tensor(element::f32, biases->get_shape());
shared_ptr<runtime::Tensor> result_ht = backend->create_tensor(element::f32, {10, 100});
shared_ptr<runtime::Tensor> result_ct =
backend->create_tensor(element::f32, Shape{20, 100});
shared_ptr<runtime::Tensor> result_ct = backend->create_tensor(element::f32, Shape{20, 100});
copy_data(src_layer_t, vector<float>(1000, 1));
copy_data(src_iter_t, vector<float>(2000, 1));
......@@ -2200,13 +2192,11 @@ TEST(cpu_fusion, rnn_fprop_1_lstm_cell)
EXPECT_TRUE(test::all_close(expected_ht, read_vector<float>(result_ht)));
EXPECT_TRUE(test::all_close(expected_ct, read_vector<float>(result_ct)));
}
#endif
TEST(cpu_fusion, fuse_lstm_cells)
{
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::cpu::pass::ConcatInputs>();
const string json_path =
file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_3lstm_cell.json");
const string json_string = file_util::read_file_to_string(json_path);
......@@ -2269,12 +2259,12 @@ static std::shared_ptr<Function> make_function(const std::string& file_name)
return func;
}
TEST(DISABLED_cpu_fusion, rnn_fusion_inter_vs_cpu_1lstm_cell)
TEST(cpu_fusion, rnn_fusion_1lstm_cell)
{
const std::string file_name("mxnet/1_lstm_cell_forward.json");
auto cpu_f = make_function(file_name);
auto int_f = make_function(file_name);
test::Uniform<float> rng(0.0f, 1.0f);
test::Uniform<float> rng(-1.0f, 1.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : int_f->get_parameters())
......@@ -2291,12 +2281,12 @@ TEST(DISABLED_cpu_fusion, rnn_fusion_inter_vs_cpu_1lstm_cell)
}
}
TEST(DISABLED_cpu_fusion, rnn_fusion_inter_vs_cpu_1rnn_layer_3lstm_cell)
TEST(cpu_fusion, rnn_fusion_1rnn_layer_3lstm_cell)
{
const std::string file_name("mxnet/1rnn_layer_3lstm_cell.json");
auto cpu_f = make_function(file_name);
auto int_f = make_function(file_name);
test::Uniform<float> rng(0.0f, 1.0f);
test::Uniform<float> rng(-1.0f, 1.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : int_f->get_parameters())
......@@ -2313,12 +2303,12 @@ TEST(DISABLED_cpu_fusion, rnn_fusion_inter_vs_cpu_1rnn_layer_3lstm_cell)
}
}
TEST(DISABLED_cpu_fusion, rnn_fusion_inter_vs_cpu_2rnn_layer_3lstm_cell)
TEST(cpu_fusion, rnn_fusion_2rnn_layer_3lstm_cell)
{
const std::string file_name("mxnet/2rnn_layer_3lstm_cell.json");
auto cpu_f = make_function(file_name);
auto int_f = make_function(file_name);
test::Uniform<float> rng(0.0f, 1.0f);
test::Uniform<float> rng(-1.0f, 1.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : int_f->get_parameters())
......@@ -2993,30 +2983,12 @@ TEST(cpu_fusion, fuse_batch_dot_forward)
}
}
TEST(cpu_fusion, fuse_rnn_across_layer)
{
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::LSTMFusion>();
pass_manager.register_pass<runtime::cpu::pass::RNNFusion>();
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
pass_manager.register_pass<runtime::cpu::pass::MultiLayerRNNFusion>();
const string json_path =
file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_1timestep.json");
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> func = ngraph::deserialize(ss);
pass_manager.run_passes(func);
size_t ref_rnn_count = 1;
auto rnn_count = count_ops_of_type<op::Rnn>(func);
EXPECT_EQ(ref_rnn_count, rnn_count);
}
TEST(DISABLED_cpu_fusion, fuse_rnn_across_2layer_1timestep)
TEST(cpu_fusion, fuse_rnn_across_layer_2layer_3timestep)
{
const std::string file_name("mxnet/2rnn_layer_1timestep.json");
const std::string file_name("mxnet/2layer_3timestep_ic100oc100.json");
auto cpu_f = make_function(file_name);
auto int_f = make_function(file_name);
test::Uniform<float> rng(0.0f, 1.0f);
test::Uniform<float> rng(-1.0f, 1.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : int_f->get_parameters())
......@@ -3028,12 +3000,10 @@ TEST(DISABLED_cpu_fusion, fuse_rnn_across_2layer_1timestep)
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
// TODO (pruthvi): Enable this after fixing failing
// mxnet rnn unit tests
// EXPECT_EQ(1, count_ops_of_type<op::Rnn>(cpu_f));
EXPECT_EQ(1, count_ops_of_type<op::Rnn>(cpu_f));
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(1), int_results.at(1), 1.0e-4f, 1.0e-4f));
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f));
}
}
......
......@@ -161,7 +161,7 @@ TEST(gpu_fusion, fuse_2_layer_rnn)
}
}
TEST(gpu_fusion, fuse_1_layer_rnn)
TEST(DISABLED_gpu_fusion, fuse_1_layer_rnn)
{
pass::Manager pass_manager;
pass_manager.register_pass<runtime::gpu::pass::LSTMFusion>();
......
......@@ -873,11 +873,17 @@
"name" : "Result_122",
"op" : "Result",
"outputs" : ["Result_122_0"]
},
{
"inputs" : ["Add_86"],
"name" : "Result_123",
"op" : "Result",
"outputs" : ["Result_123_0"]
}
],
"parameters" : [
"Parameter_12", "Parameter_1", "Parameter_2", "Parameter_19",
"Parameter_20", "Parameter_7", "Parameter_0"
],
"result" : ["Result_122"]
"result" : ["Result_122", "Result_123"]
}]