Commit 7775d49d authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

Adding support for fused ops that are decomposable to core ngraph ops (#2688)

* Initial support for specification of fused ops and type inference

* Added FusedOpDecomposition pass and execution test cases

* Serializer support

* style fix

* Add FusedOpDecomposition to GPU and IGPU backends

* Addressed PR feedback

* Fix comment

* Addressed PR feedback
parent 8c081092
......@@ -254,6 +254,8 @@ set (SRC
op/tanh.hpp
op/topk.cpp
op/topk.hpp
op/fused/prelu.cpp
op/fused/prelu.hpp
op/util/arithmetic_reduction.cpp
op/util/arithmetic_reduction.hpp
op/util/binary_elementwise_arithmetic.cpp
......@@ -262,6 +264,10 @@ set (SRC
op/util/binary_elementwise_comparison.hpp
op/util/binary_elementwise_logical.cpp
op/util/binary_elementwise_logical.hpp
op/util/broadcasting.cpp
op/util/broadcasting.hpp
op/util/fused_op.cpp
op/util/fused_op.hpp
op/util/index_reduction.cpp
op/util/index_reduction.hpp
op/util/logical_reduction.cpp
......@@ -284,6 +290,8 @@ set (SRC
pass/cse.hpp
pass/dump_sorted.cpp
pass/dump_sorted.hpp
pass/fused_op_decomposition.cpp
pass/fused_op_decomposition.hpp
pass/get_output_element_elimination.cpp
pass/get_output_element_elimination.hpp
pass/graph_rewrite.cpp
......
......@@ -65,18 +65,16 @@ void ngraph::traverse_nodes(const Function* p,
traverse_nodes(nodes, f, include_control_deps);
}
// This version traverses directly from input/output nodes to perform functions on
// graphs that are not wrapped by functions. Most useful for finding parameters of a graph
// directly from the result nodes, not from function parameters.
void ngraph::traverse_nodes(const NodeVector& io_nodes,
void ngraph::traverse_nodes(const NodeVector& subgraph_results,
std::function<void(std::shared_ptr<Node>)> f,
bool include_control_deps,
NodeVector stop_nodes)
const NodeVector& subgraph_params)
{
std::unordered_set<std::shared_ptr<Node>> instances_seen(stop_nodes.begin(), stop_nodes.end());
std::unordered_set<std::shared_ptr<Node>> instances_seen{subgraph_params.begin(),
subgraph_params.end()};
std::deque<std::shared_ptr<Node>> stack;
for (auto r : io_nodes)
for (auto r : subgraph_results)
{
stack.push_front(r);
}
......@@ -484,6 +482,13 @@ NodeVector ngraph::get_subgraph_outputs(const NodeVector& nodes,
return outputs;
}
NodeVector ngraph::extract_subgraph(const NodeVector& results, const NodeVector& args)
{
NodeVector subgraph;
traverse_nodes(results, [&](std::shared_ptr<Node> n) { subgraph.push_back(n); }, true, args);
return subgraph;
}
bool ngraph::is_used(Node* node)
{
std::unordered_set<Node*> instances_seen;
......
......@@ -45,14 +45,29 @@ namespace ngraph
void traverse_nodes(const std::shared_ptr<const Function> p,
std::function<void(std::shared_ptr<Node>)> f,
bool include_control_deps = false);
void traverse_nodes(const Function* p,
std::function<void(std::shared_ptr<Node>)> f,
bool include_control_deps);
void traverse_nodes(const NodeVector& io_nodes,
/// \brief Visit each node in a sub-graph of the entire graph
/// \param subgraph_results The output nodes of the sub-graph
/// \param f Function to execute at each node in the traversal
/// \param include_control_deps Whether to include control deps
/// while traversing the sub-graph
/// \param subgraph_params Input nodes of the sub-graph (optional)
///
/// Traverses a sub-graph starting from subgraph_results moving up
/// towards parameter nodes. Traversal stops if it hits a node in
/// subgraph_params.
///
/// Most useful for finding the parameters of a graph directly from the
/// result nodes rather than from function parameters, or for extracting a
/// sub-graph relevant to the computation of certain outputs.
void traverse_nodes(const NodeVector& subgraph_results,
std::function<void(std::shared_ptr<Node>)> f,
bool include_control_deps,
NodeVector stop_nodes = {});
const NodeVector& subgraph_params = {});
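// Illustrative usage sketch (hypothetical, not part of this change): collect every node
// that feeds the given sub-graph outputs, stopping the upward walk at the sub-graph inputs.
//
//     NodeVector visited;
//     traverse_nodes(results /* sub-graph outputs */,
//                    [&](std::shared_ptr<Node> n) { visited.push_back(n); },
//                    false /* include_control_deps */,
//                    params /* sub-graph inputs */);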
void traverse_functions(std::shared_ptr<Function> p,
std::function<void(std::shared_ptr<Function>)> f);
......@@ -125,6 +140,7 @@ namespace ngraph
return result_list;
}
// For cases where `nodes` is a subset of the entire graph
template <typename T>
std::list<std::shared_ptr<Node>> subgraph_topological_sort(const T& nodes,
bool include_control_deps = false)
......@@ -205,7 +221,7 @@ namespace ngraph
template <typename T>
void validate_nodes_and_infer_types(const T& nodes)
{
for (auto node : topological_sort(nodes))
for (auto node : subgraph_topological_sort(nodes))
{
node->delayed_validate_and_infer_types();
}
......@@ -296,6 +312,10 @@ namespace ngraph
const NodeVector& exclusions,
bool ignore_unused = false);
// Extract the sub-graph computing the `results`. Stops backward traversal at either a Parameter node
// or a node that belongs to `args`.
NodeVector extract_subgraph(const NodeVector& results, const NodeVector& args);
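// Illustrative usage sketch (hypothetical nodes): extract_subgraph({result}, {arg}) returns
// every node reachable backwards from `result` before hitting `arg` or a Parameter, i.e. the
// nodes that compute `result` from `arg`.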
bool is_one(std::shared_ptr<Node> reduce_constant);
bool compare_constants(const std::shared_ptr<Node>& n1, const std::shared_ptr<Node>& n2);
......
......@@ -95,6 +95,7 @@
#include "ngraph/op/experimental/shape_of.hpp"
#include "ngraph/op/experimental/transpose.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/fused/prelu.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/greater_eq.hpp"
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/op/fused/prelu.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/convert.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/util/broadcasting.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
op::PRelu::PRelu(const shared_ptr<Node>& data, const shared_ptr<Node>& slope)
: FusedOp("PRelu", {data, slope})
{
constructor_validate_and_infer_types();
}
NodeVector op::PRelu::decompose_op() const
{
auto data = get_argument(0);
auto data_shape = data->get_shape();
auto slope = get_argument(1);
auto slope_shape = slope->get_shape();
if ((slope_shape.size() == 1) && (slope_shape.at(0) != 1))
{
auto it = std::find(std::begin(data_shape), std::end(data_shape), slope_shape.at(0));
auto index = std::distance(std::begin(data_shape), it);
slope = make_broadcast_node(slope, data->get_shape(), index);
}
else if (data_shape != slope_shape)
{
slope = numpy_style_broadcast({slope, data})[0];
}
// x < 0 => f(x) = x * slope
// x >= 0 => f(x) = x
std::shared_ptr<ngraph::Node> zero_node = std::make_shared<ngraph::op::Constant>(
data->get_element_type(), ngraph::Shape{}, std::vector<double>{0});
zero_node = make_broadcast_node(zero_node, data->get_shape());
std::shared_ptr<ngraph::Node> negative_map = std::make_shared<ngraph::op::Convert>(
std::make_shared<ngraph::op::Less>(data, zero_node), data->get_element_type());
std::shared_ptr<ngraph::Node> positive_map = std::make_shared<ngraph::op::Convert>(
std::make_shared<ngraph::op::Greater>(data, zero_node), data->get_element_type());
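// negative_map is 1 where x < 0 and 0 elsewhere; positive_map is 1 where x > 0.
// Their combination below turns `slope` into an element-wise multiplier that equals
// `slope` for negative inputs, 1 for positive inputs, and 0 at exactly zero (where the
// final product is zero anyway), so data * multiplier implements PRelu.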
slope = negative_map * slope + positive_map;
return {data * slope};
}
shared_ptr<Node> op::PRelu::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 2)
{
throw ngraph_error("Incorrect number of new arguments");
}
return make_shared<PRelu>(new_args.at(0), new_args.at(1));
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/node.hpp"
#include "ngraph/op/op.hpp"
#include "ngraph/op/util/fused_op.hpp"
namespace ngraph
{
namespace op
{
/// \brief Parameterized ReLU
/// x < 0 => f(x) = x * slope
/// x >= 0 => f(x) = x
///
class PRelu : public ngraph::op::util::FusedOp
{
public:
/// \brief Constructs a PRelu operation.
///
/// \param data Input tensor
/// \param slope Multipliers for negative values
PRelu(const std::shared_ptr<ngraph::Node>& data,
const std::shared_ptr<ngraph::Node>& slope);
virtual NodeVector decompose_op() const override;
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
};
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
// This collection contains one entry for each fused op.
//
NGRAPH_OP(PRelu, ngraph::op)
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cstddef>
#include <iterator>
#include <numeric>
#include <vector>
#include "broadcasting.hpp"
#include "ngraph/axis_vector.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/util.hpp"
/// \brief Calculate the output shape of a numpy-style broadcast operation for two shapes.
///
/// more info:
/// https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html#general-broadcasting-rules
/// example:
/// left: [3, 1, 10] right: [5, 1]
/// return: [3, 5, 10]
///
/// \param left_shape First input shape.
/// \param right_shape Second input shape.
/// \return The broadcast shape of the two input shapes.
static ngraph::Shape calculate_broadcast_shape(ngraph::Shape left_shape, ngraph::Shape right_shape)
{
ngraph::Shape result;
auto left_rank = left_shape.size();
auto right_rank = right_shape.size();
auto max_rank = std::max(left_rank, right_rank);
// left-pad the left_shape with zeros
left_shape.insert(std::begin(left_shape), max_rank - left_rank, 0);
// left-pad the right_shape with zeros
right_shape.insert(std::begin(right_shape), max_rank - right_rank, 0);
for (std::size_t index = 0; index < max_rank; ++index)
{
result.push_back(std::max(left_shape.at(index), right_shape.at(index)));
}
return result;
};
/// \brief Calculate the output shape of a numpy-style broadcast operation for all input shapes.
///
/// This function finds the maximum tensor shape that will result from an element-wise operation
/// applied to the input tensors. It also prepares the shape of each input for the element-wise
/// operation by left-padding those shapes with ones so that their rank equals the rank of the
/// target shape.
///
/// \param input_shapes A vector of input shapes for which a common shape should be found
/// \return A pair containing the target shape as its first member and a vector of rank-padded
/// input shapes, ready to be broadcast, as the second member
static std::pair<ngraph::Shape, std::vector<ngraph::Shape>>
get_numpy_broadcast_shapes(const std::vector<ngraph::Shape>& input_shapes)
{
ngraph::Shape target_shape = std::accumulate(std::begin(input_shapes),
std::end(input_shapes),
ngraph::Shape{},
calculate_broadcast_shape);
std::vector<ngraph::Shape> full_shapes;
for (const ngraph::Shape& input : input_shapes)
{
ngraph::Shape padded_shape{input};
padded_shape.insert(std::begin(padded_shape), target_shape.size() - padded_shape.size(), 1);
full_shapes.push_back(std::move(padded_shape));
}
return {target_shape, full_shapes};
}
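// Example (values for illustration only): input shapes { {3, 1, 10}, {5, 1} } yield the
// target shape {3, 5, 10} and the rank-padded shapes { {3, 1, 10}, {1, 5, 1} }.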
/// \brief Calculate the output shape of numpy-style broadcast operation for all input nodes.
///
/// \param inputs A vector of input nodes for which a common shape should be found
/// \return A pair that contains the target shape as its first object and a vector of padded
/// input shapes ready to be broadcasted as the second object
static std::pair<ngraph::Shape, std::vector<ngraph::Shape>>
get_numpy_broadcast_shapes(const ngraph::NodeVector& inputs)
{
std::vector<ngraph::Shape> input_shapes;
for (const auto& input : inputs)
{
input_shapes.push_back(input->get_shape());
}
return get_numpy_broadcast_shapes(input_shapes);
}
/// \brief Broadcast input node.
///
/// \note The source shape does not have to be the actual shape of the input node; however,
/// it should be a superset of it (containing it as a contiguous subset). This implies
/// that we may expand the number of axes of the input node.
/// The ranks of source_shape and output_shape must be equal. This means that the
/// source_shape has to be padded with ones for this operation.
///
/// \param[in] node The input Node to be broadcasted.
/// \param[in] output_shape The output shape.
/// \param[in] source_shape The source shape from which we want to broadcast input node.
///
/// \return The broadcasted Node.
///
static std::shared_ptr<ngraph::Node>
broadcast_node_numpy_style(const std::shared_ptr<ngraph::Node>& node,
const ngraph::Shape& output_shape,
const ngraph::Shape& source_shape)
{
if (source_shape.size() != output_shape.size())
{
NGRAPH_WARN << "Ranks of source_shape and output_shape dont match: " << source_shape.size()
<< " vs " << output_shape.size();
}
ngraph::AxisVector broadcast_axes;
ngraph::Shape squeezed_shape;
// Positions of axes which have a length of 1 are needed to calculate broadcast_axes
// for the nGraph broadcast operation. We need to remove all ones from the source shape
// to avoid broadcast-axis conflicts.
for (std::size_t index = 0; index < output_shape.size(); ++index)
{
if (source_shape.at(index) == 1)
{
broadcast_axes.push_back(index);
}
else
{
squeezed_shape.push_back(source_shape.at(index));
}
}
// Remove axes which have length of 1 from source shape
auto broadcasted_node = std::make_shared<ngraph::op::Reshape>(
node, ngraph::get_default_order(node->get_shape()), squeezed_shape);
return std::make_shared<ngraph::op::Broadcast>(broadcasted_node, output_shape, broadcast_axes);
}
namespace ngraph
{
namespace op
{
NodeVector numpy_style_broadcast(const NodeVector& inputs)
{
if (inputs.size() <= 1)
{
return inputs;
}
// find the output tensor's shape, then broadcast all inputs so that they are compatible
auto bcast_shapes = get_numpy_broadcast_shapes(inputs);
NodeVector broadcasted_inputs;
for (std::size_t i = 0; i < inputs.size(); ++i)
{
broadcasted_inputs.push_back(broadcast_node_numpy_style(
    inputs[i], bcast_shapes.first, bcast_shapes.second[i]));
}
return broadcasted_inputs;
}
NodeVector
numpy_style_broadcast_for_matmul_operation(const std::shared_ptr<ngraph::Node>& left,
const std::shared_ptr<ngraph::Node>& right)
{
const auto& left_shape = left->get_shape();
const auto& right_shape = right->get_shape();
// Broadcast only _stack of matrices_ axes.
const auto& numpy_shapes = get_numpy_broadcast_shapes(
{Shape{std::begin(left_shape), std::next(std::end(left_shape), -2)},
Shape{std::begin(right_shape), std::next(std::end(right_shape), -2)}});
// Prepare tensors output shapes with broadcasted _stack of matrices_ axes.
auto left_output_shape = numpy_shapes.first;
auto right_output_shape = numpy_shapes.first;
// Append the last two axes original dimensions.
left_output_shape.insert(std::end(left_output_shape),
std::next(std::begin(left_shape), left_shape.size() - 2),
std::end(left_shape));
right_output_shape.insert(std::end(right_output_shape),
std::next(std::begin(right_shape), right_shape.size() - 2),
std::end(right_shape));
auto left_full_shape = numpy_shapes.second.at(0);
auto right_full_shape = numpy_shapes.second.at(1);
// Append the last two axes original dimensions.
left_full_shape.insert(std::end(left_full_shape),
std::next(std::begin(left_shape), left_shape.size() - 2),
std::end(left_shape));
right_full_shape.insert(std::end(right_full_shape),
std::next(std::begin(right_shape), right_shape.size() - 2),
std::end(right_shape));
return {broadcast_node_numpy_style(left, left_output_shape, left_full_shape),
broadcast_node_numpy_style(right, right_output_shape, right_full_shape)};
}
NodeVector
legacy_style_broadcast_for_binary_operation(const std::shared_ptr<ngraph::Node>& left,
const std::shared_ptr<ngraph::Node>& right,
std::size_t start_match_axis)
{
const auto& left_shape = left->get_shape();
const auto& right_shape = right->get_shape();
bool dimensions_identical = (left_shape == right_shape);
if (dimensions_identical)
{
return {left, right};
}
// Prepare new shape of right operand for broadcasting
// Remove dimensions with length=1 from back
auto new_right_shape = right_shape;
for (int dimension = new_right_shape.size() - 1; dimension >= 0; --dimension)
{
if (new_right_shape[dimension] == 1)
{
new_right_shape.pop_back();
}
else
{
break;
}
}
// Count the leading dimensions with length 1 at the front
std::size_t num_ones = 0;
for (std::size_t dimension : new_right_shape)
{
if (dimension == 1)
{
++num_ones;
}
else
{
break;
}
}
// Remove dimensions with length=1 from front
new_right_shape.erase(std::begin(new_right_shape),
std::next(std::begin(new_right_shape), num_ones));
auto reshape_right = std::make_shared<ngraph::op::Reshape>(
right, ngraph::get_default_order(right_shape), new_right_shape);
// Shift the broadcast start axis right by the number of removed leading 1-dims
start_match_axis += num_ones;
auto broadcast_right = std::make_shared<ngraph::op::Broadcast>(
reshape_right,
left_shape,
calculate_broadcast_axes(left_shape, new_right_shape, start_match_axis));
return {left, broadcast_right};
}
AxisSet calculate_broadcast_axes(const Shape& output_shape,
const Shape& input_shape,
std::size_t start_match_axis)
{
std::vector<std::size_t> result(output_shape.size() - input_shape.size());
// Populate the result vector with a monotonically increasing series from 0 up to
// output_shape.size(), excluding values in the range [start_match_axis, start_match_axis + input_shape.size())
std::iota(std::begin(result), std::begin(result) + start_match_axis, 0);
std::iota(std::begin(result) + start_match_axis,
std::end(result),
start_match_axis + input_shape.size());
return result;
}
} // namespace op
} // namespace ngraph
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include "ngraph/axis_set.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace op
{
/// \brief Cast the shapes of all input nodes for an element-wise operation that requires shape compatibility.
///
/// \param inputs Original list of inputs
///
/// \return Numpy-style broadcasted list of nodes.
NodeVector numpy_style_broadcast(const NodeVector& inputs);
/// \brief Cast the shapes of two nodes to make them compatible for an element-wise binary operation.
///
/// If necessary, the right-hand-side argument will be broadcast to match the shape
/// of the left-hand-side argument. The start of the mutually equal shape is
/// specified by the argument "start_match_axis"; if it is not set,
/// suffix matching is assumed.
///
/// This style of broadcast was used in ONNX Op sets prior to version 7, where it was
/// replaced by numpy-style broadcasting.
///
/// \param left Node containing the left-hand input of the binary op.
/// \param right Node containing the right-hand input of the binary op.
/// \param start_match_axis Position in the shape denoting the start of the mutually equal shape.
///
/// \return Left and right node after broadcasting.
NodeVector
legacy_style_broadcast_for_binary_operation(const std::shared_ptr<ngraph::Node>& left,
const std::shared_ptr<ngraph::Node>& right,
std::size_t start_match_axis);
/// \brief Broadcast the shapes of two nodes to make them compatible for a matrix multiplication.
///
/// \note This function reflects the broadcasting behaviour of NumPy's `matmul` operation
/// \link https://docs.scipy.org/doc/numpy/reference/generated/numpy.matmul.html
/// This means that only the \"stack of matrices\" axes are bidirectionally broadcast.
/// The last two dimensions are left untouched.
///
/// \param[in] left The Node providing data for the left-hand side of matrix multiplication.
/// \param[in] right The Node providing data for the right-hand side of matrix multiplication.
///
/// \return The vector containing both nodes broadcasted.
///
NodeVector
numpy_style_broadcast_for_matmul_operation(const std::shared_ptr<ngraph::Node>& left,
const std::shared_ptr<ngraph::Node>& right);
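// Example (values for illustration only): left shape {2, 1, 3, 4} and right shape {5, 4, 6}
// broadcast their "stack of matrices" axes to {2, 5}, returning nodes of shape
// {2, 5, 3, 4} and {2, 5, 4, 6}; the trailing matrix dimensions are left untouched.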
/// \brief Generate a list of broadcast axes.
///
/// \details Informally, a broadcast "adds" axes to the input tensor, replicating
/// elements from the input tensor as needed to fill the new dimensions.
/// This function calculates which of the output axes are added in this way.
///
/// \param output_shape The new shape for the output tensor.
/// \param input_shape The shape of input tensor.
/// \param start_match_axis The axis along which we want to replicate elements.
/// The starting axis position (0-based) in the output
/// shape from which the current shape of the tensor
/// matches the desired new shape.
///
/// \return The indices of added axes.
AxisSet calculate_broadcast_axes(const Shape& output_shape,
const Shape& input_shape,
std::size_t start_match_axis);
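// Example (values for illustration only): calculate_broadcast_axes(Shape{2, 3, 4, 5},
// Shape{3, 4}, 1) returns AxisSet{0, 3}: axis 0 is added before the matched axes
// and axis 3 is added after them.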
/// \brief Generate a list of broadcast axes.
///
/// \details Broadcast "adds" elements along axes to the input tensor, replicating
/// elements from the input tensor as needed to fill the new dimensions.
/// This function calculates which of the output axes are added in this way.
///
/// This function will attempt to match shapes, assuming the current shape
/// matches the rightmost positions of the desired new shape. This behaviour
/// is similar to NumPy's broadcasting.
///
/// \param output_shape The new shape for the output tensor.
/// \param input_shape The shape of input tensor.
///
/// \return The indices of added axes.
inline AxisSet calculate_broadcast_axes(const Shape& output_shape, const Shape& input_shape)
{
return calculate_broadcast_axes(
output_shape, input_shape, output_shape.size() - input_shape.size());
}
inline std::shared_ptr<ngraph::Node>
make_broadcast_node(const std::shared_ptr<ngraph::Node>& node, ngraph::Shape new_shape)
{
return std::make_shared<ngraph::op::Broadcast>(
node, new_shape, calculate_broadcast_axes(new_shape, node->get_shape()));
}
inline std::shared_ptr<ngraph::Node>
make_broadcast_node(const std::shared_ptr<ngraph::Node>& node,
ngraph::Shape new_shape,
std::size_t start_match_axis)
{
return std::make_shared<ngraph::op::Broadcast>(
node,
new_shape,
calculate_broadcast_axes(new_shape, node->get_shape(), start_match_axis));
}
} // namespace op
} // namespace ngraph
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/op/util/fused_op.hpp"
#include "ngraph/graph_util.hpp"
using namespace ngraph;
op::util::FusedOp::FusedOp(const std::string& node_type, const NodeVector& args)
: Op(node_type, args)
{
}
void op::util::FusedOp::validate_and_infer_types()
{
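// A fused op derives its output element types and shapes by decomposing itself into core
// ops, validating/inferring types on that sub-graph, and then copying each sub-graph
// output's type and shape onto the corresponding output of the fused op.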
auto subgraph_outputs = decompose_op();
auto subgraph = extract_subgraph(subgraph_outputs, get_arguments());
validate_nodes_and_infer_types(subgraph);
size_t i = 0;
for (auto output_node : subgraph_outputs)
{
for (size_t j = 0; j < output_node->get_output_size(); j++, i++)
{
set_output_type(
i, output_node->get_output_element_type(j), output_node->get_output_shape(j));
}
}
}
void op::util::FusedOp::generate_adjoints(autodiff::Adjoints& adjoints, const NodeVector& deltas)
{
// TODO
throw ngraph_error("Autodiff on fused ops not supported yet");
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/op/op.hpp"
namespace ngraph
{
namespace op
{
namespace util
{
/// \brief Abstract base class for fused ops, i.e., ops that can be broken down into core ngraph ops
///
class FusedOp : public Op
{
public:
/// \brief Decomposes the FusedOp into a sub-graph consisting of core ngraph ops
///
/// \return A vector of nodes comprising the sub-graph. The order of output
/// tensors must match the output tensors of the FusedOp
virtual NodeVector decompose_op() const = 0;
void validate_and_infer_types() override;
void generate_adjoints(autodiff::Adjoints& adjoints,
const NodeVector& deltas) override;
protected:
/// \brief Constructs a FusedOp
///
/// \param args Nodes that produce the input tensors for the fused op
FusedOp(const std::string& node_type, const NodeVector& args);
};
}
}
}
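A minimal sketch of how another fused op could subclass FusedOp; the op name, decomposition, and choice of core ops below are illustrative only (PRelu above is the real op added by this commit):

#include "ngraph/op/add.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/util/fused_op.hpp"

// Hypothetical fused op computing relu(data + bias) out of core ngraph ops.
class AddRelu : public ngraph::op::util::FusedOp
{
public:
    AddRelu(const std::shared_ptr<ngraph::Node>& data,
            const std::shared_ptr<ngraph::Node>& bias)
        : FusedOp("AddRelu", {data, bias})
    {
        constructor_validate_and_infer_types();
    }

    ngraph::NodeVector decompose_op() const override
    {
        auto sum = std::make_shared<ngraph::op::Add>(get_argument(0), get_argument(1));
        // One entry per fused-op output, in the same order as the fused op's outputs.
        return {std::make_shared<ngraph::op::Relu>(sum)};
    }

    std::shared_ptr<ngraph::Node>
        copy_with_new_args(const ngraph::NodeVector& new_args) const override
    {
        return std::make_shared<AddRelu>(new_args.at(0), new_args.at(1));
    }
};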
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/pass/fused_op_decomposition.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/op/util/fused_op.hpp"
using namespace std;
using namespace ngraph;
bool ngraph::pass::FusedOpDecomposition::run_on_node(std::shared_ptr<ngraph::Node> node)
{
bool modified = false;
if (auto fused_op = std::dynamic_pointer_cast<ngraph::op::util::FusedOp>(node))
{
auto subgraph = fused_op->decompose_op();
if (subgraph.size() != fused_op->get_output_size())
{
throw ngraph_error("While replacing " + node->get_name() +
", mismatch between op output count and outputs of the decomposed "
"subgraph. Expected: " +
to_string(fused_op->get_output_size()) + " Got: " +
to_string(subgraph.size()));
}
if (fused_op->get_output_size() == 1)
{
ngraph::replace_node(fused_op, subgraph[0]);
}
else
{
// TODO (jbobba): Handle multi-output ops. Need to find the GOE for the output and replace that with subgraph output node
}
modified = true;
}
return modified;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace pass
{
class FusedOpDecomposition : public NodePass
{
public:
bool run_on_node(std::shared_ptr<ngraph::Node> node) override;
};
}
}
......@@ -125,6 +125,7 @@
#include "ngraph/pass/core_fusion.hpp"
#include "ngraph/pass/cse.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/fused_op_decomposition.hpp"
#include "ngraph/pass/get_output_element_elimination.hpp"
#include "ngraph/pass/like_replacement.hpp"
#include "ngraph/pass/liveness.hpp"
......@@ -1117,6 +1118,7 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(
auto pass_map = pass_config.get_enables();
REGISTER_KNOBBED_PASS(LikeReplacement, true, ngraph::pass);
REGISTER_KNOBBED_PASS(FusedOpDecomposition, true, ngraph::pass);
REGISTER_KNOBBED_PASS(NopElimination, true, ngraph::pass);
REGISTER_KNOBBED_PASS(ZeroDimTensorElimination, true, ngraph::pass);
REGISTER_KNOBBED_PASS(LSTMFusion, true, runtime::cpu::pass);
......
......@@ -33,6 +33,7 @@
#include "ngraph/function.hpp"
#include "ngraph/node.hpp"
#include "ngraph/pass/algebraic_simplification.hpp"
#include "ngraph/pass/fused_op_decomposition.hpp"
#include "ngraph/pass/get_output_element_elimination.hpp"
#include "ngraph/pass/like_replacement.hpp"
......@@ -170,6 +171,7 @@ void runtime::gpu::GPUCompiledFunction::compile()
#endif
pass_manager.register_pass<runtime::gpu::pass::BatchNormCache>();
pass_manager.register_pass<ngraph::pass::LikeReplacement>();
pass_manager.register_pass<ngraph::pass::FusedOpDecomposition>();
pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
pass_manager.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
......
......@@ -43,6 +43,7 @@
#include "ngraph/pass/algebraic_simplification.hpp"
#include "ngraph/pass/cse.hpp"
#include "ngraph/pass/fused_op_decomposition.hpp"
#include "ngraph/pass/get_output_element_elimination.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/nop_elimination.hpp"
......@@ -429,6 +430,7 @@ shared_ptr<runtime::Executable>
{
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<ngraph::pass::FusedOpDecomposition>();
pass_manager.register_pass<ngraph::pass::NopElimination>();
pass_manager.register_pass<ngraph::pass::AlgebraicSimplification>();
pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
......
......@@ -21,6 +21,7 @@
#include "ngraph/op/select.hpp"
#include "ngraph/op/util/binary_elementwise_comparison.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/fused_op_decomposition.hpp"
#include "ngraph/pass/like_replacement.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
......@@ -39,6 +40,7 @@ runtime::interpreter::INTExecutable::INTExecutable(const shared_ptr<Function>& f
m_is_compiled = true;
pass::Manager pass_manager;
pass_manager.register_pass<pass::LikeReplacement>();
pass_manager.register_pass<pass::FusedOpDecomposition>();
pass_manager.register_pass<pass::AssignLayout<DenseTensorLayout>>();
pass_manager.register_pass<pass::Liveness>();
pass_manager.run_passes(function);
......
......@@ -64,6 +64,7 @@
#include "ngraph/op/experimental/shape_of.hpp"
#include "ngraph/op/experimental/transpose.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/fused/prelu.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/greater_eq.hpp"
......@@ -124,6 +125,7 @@ using const_data_callback_t = shared_ptr<Node>(const string&, const element::Typ
#define NGRAPH_OP(a, b) a,
enum class OP_TYPEID
{
#include "ngraph/op/fused_op_tbl.hpp"
#include "ngraph/op/op_tbl.hpp"
UnknownOp
};
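// Both op tables are X-macro headers: each expands NGRAPH_OP(name, namespace) once per op,
// so including fused_op_tbl.hpp alongside op_tbl.hpp adds every fused op (currently just
// PRelu) to OP_TYPEID here and to the typeid_map lookup below.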
......@@ -137,6 +139,7 @@ static OP_TYPEID get_typeid(const string& s)
// ...
#define NGRAPH_OP(a, b) {#a, OP_TYPEID::a},
static const unordered_map<string, OP_TYPEID> typeid_map{
#include "ngraph/op/fused_op_tbl.hpp"
#include "ngraph/op/op_tbl.hpp"
};
#undef NGRAPH_OP
......@@ -1024,6 +1027,11 @@ static shared_ptr<ngraph::Function>
node = make_shared<op::Power>(args[0], args[1]);
break;
}
case OP_TYPEID::PRelu:
{
node = make_shared<op::PRelu>(args[0], args[1]);
break;
}
case OP_TYPEID::Product:
{
auto reduction_axes = node_js.at("reduction_axes").get<set<size_t>>();
......@@ -1674,6 +1682,8 @@ static json write(const Node& n, bool binary_constant_data)
node["output_shapes"] = std::move(outputs_js);
break;
}
case OP_TYPEID::PRelu: { break;
}
case OP_TYPEID::Product:
{
auto tmp = dynamic_cast<const op::Product*>(&n);
......
......@@ -143,6 +143,7 @@ set(MULTI_TEST_SRC
backend_comparison.in.cpp
backend_dot.in.cpp
backend_embedding_lookup.in.cpp
backend_fusedop.in.cpp
backend_one_hot.in.cpp
backend_pool.in.cpp
backend_reshape.in.cpp
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <cstdlib>
#include <random>
#include <string>
#include "gtest/gtest.h"
#include "ngraph/ngraph.hpp"
#include "util/all_close.hpp"
#include "util/all_close_f.hpp"
#include "util/ndarray.hpp"
#include "util/random.hpp"
#include "util/test_control.hpp"
#include "util/test_tools.hpp"
using namespace std;
using namespace ngraph;
static string s_manifest = "${MANIFEST}";
NGRAPH_TEST(${BACKEND_NAME}, prelu)
{
Shape shape{3, 2};
Shape rshape{3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, rshape);
auto prelu = make_shared<op::PRelu>(A, B);
auto f0 = make_shared<Function>(NodeVector{prelu}, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{-2, 3, -2, 1, -1, 0});
auto b = backend->create_tensor(element::f32, rshape);
copy_data(b, vector<float>{0, 0.5, 1});
auto result0 = backend->create_tensor(element::f32, shape);
auto handle = backend->compile(f0);
handle->call_with_validate({result0}, {a, b});
vector<float> expected{0, 3, -1, 1, -1, 0};
EXPECT_EQ(expected, read_vector<float>(result0));
}
NGRAPH_TEST(${BACKEND_NAME}, prelu_shared_slope)
{
Shape shape{3, 2};
Shape rshape{};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, rshape);
auto prelu = make_shared<op::PRelu>(A, B);
auto f0 = make_shared<Function>(NodeVector{prelu}, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{-2, 3, -2, 1, -1, 0});
auto b = backend->create_tensor(element::f32, rshape);
copy_data(b, vector<float>{0.5});
auto result0 = backend->create_tensor(element::f32, shape);
auto handle = backend->compile(f0);
handle->call_with_validate({result0}, {a, b});
vector<float> expected{-1, 3, -1, 1, -0.5, 0};
EXPECT_EQ(expected, read_vector<float>(result0));
}
NGRAPH_TEST(${BACKEND_NAME}, prelu_negative_slope)
{
Shape shape{3, 2};
Shape rshape{};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, rshape);
auto prelu = make_shared<op::PRelu>(A, B);
auto f0 = make_shared<Function>(NodeVector{prelu}, ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{-2, 3, -2, 1, -1, 0});
auto b = backend->create_tensor(element::f32, rshape);
copy_data(b, vector<float>{-0.5});
auto result0 = backend->create_tensor(element::f32, shape);
auto handle = backend->compile(f0);
handle->call_with_validate({result0}, {a, b});
vector<float> expected{1, 3, 1, 1, 0.5, 0};
EXPECT_EQ(expected, read_vector<float>(result0));
}
......@@ -13074,3 +13074,13 @@ TEST(type_prop, dynslice_params_et_wrong)
DynSlice_Test_Type_Except(arg, lower_bounds, upper_bounds, strides);
}
}
TEST(type_prop, prelu)
{
auto param = make_shared<op::Parameter>(element::f32, Shape{2, 4});
auto slope = make_shared<op::Parameter>(element::f32, Shape{2});
Shape prelu_shape{2, 4};
auto prelu = make_shared<op::PRelu>(param, slope);
ASSERT_EQ(prelu->get_element_type(), element::f32);
ASSERT_EQ(prelu->get_shape(), prelu_shape);
}