Unverified commit 4e29c153 authored by Sandeep, committed by GitHub

Moving Relu op from Argon backend with CoreFusion (#489)

* relu for interpreter

* relu in serializer

* core fusion

* relu backprop

* relu backprop and test interpreter

* core fusion for CPU

* COREFusion -> CoreFusion

* relu MKL-DNN
parent 3380928c
......@@ -63,6 +63,7 @@ set (SRC
ops/power.cpp
ops/reduce.cpp
ops/reduce_window.cpp
ops/relu.cpp
ops/replace_slice.cpp
ops/reshape.cpp
ops/reverse.cpp
......@@ -95,6 +96,7 @@ set (SRC
pass/reshape_elimination.cpp
pass/visualize_tree.cpp
pattern/matcher.cpp
pattern/core_fusion.cpp
runtime/aligned_buffer.cpp
runtime/host_tensor_view.cpp
runtime/interpreter/int_backend.cpp
......
......@@ -25,6 +25,7 @@
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/constant.hpp"
using namespace std;
......@@ -256,3 +257,25 @@ std::shared_ptr<ngraph::Function> ngraph::clone_function(std::shared_ptr<ngraph:
// create and return cloned function
return std::make_shared<ngraph::Function>(cloned_results, cloned_params);
}
bool ngraph::is_equal_to_const_value(std::string const_value, std::shared_ptr<Node> reduce_constant)
{
if (auto rc = dynamic_pointer_cast<ngraph::op::Constant>(reduce_constant))
{
auto cshape = rc->get_shape();
size_t n = shape_size(cshape);
// construct a constant with the same element type and shape, every element set to const_value
std::vector<std::string> vector_zero{n, const_value};
auto constant_val_op =
std::make_shared<ngraph::op::Constant>(rc->get_element_type(), cshape, vector_zero);
// compare the two constants' raw buffers byte-for-byte
size_t n_bytes = n * rc->get_element_type().size();
NGRAPH_DEBUG << "Comparing " << n_bytes << " bytes";
return !memcmp(constant_val_op->get_data_ptr(), rc->get_data_ptr(), n_bytes);
}
else
{
return false;
}
}
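A minimal usage sketch of the new helper (illustrative only, not part of this diff; it reuses the op::Constant::create API shown elsewhere in the change):
auto ones = ngraph::op::Constant::create(ngraph::element::f32, ngraph::Shape{2, 2}, {1, 1, 1, 1});
bool all_ones = ngraph::is_equal_to_const_value("1", ones); // true: every element equals 1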
......@@ -53,6 +53,8 @@ namespace ngraph
std::list<std::shared_ptr<Node>>
topological_sort(const std::list<std::shared_ptr<Node>>& nodes);
bool is_equal_to_const_value(std::string const_value, std::shared_ptr<Node> reduce_constant);
// maps original to replacement nodes e.g. for clone utilities
// performs index checking on access
class NodeMap
......
......@@ -107,6 +107,7 @@
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/multiply.hpp"
using namespace std;
using namespace ngraph;
op::Relu::Relu(shared_ptr<Node> arg)
: UnaryElementwiseArithmetic("Relu", {arg})
{
set_value_type_checked(arg->get_element_type(), arg->get_shape());
}
op::ReluBackprop::ReluBackprop(shared_ptr<Node> arg, shared_ptr<Node> delta)
: RequiresTensorViewArgs("ReluBackprop", {arg, delta})
{
if (arg->get_element_type() != delta->get_element_type())
{
throw ngraph_error("Argument and delta element types for Relu backprop do not match");
}
if (arg->get_shape() != delta->get_shape())
{
throw ngraph_error("Argument and delta shape for Relu backprop do not match");
}
set_value_type_checked(delta->get_element_type(), delta->get_shape());
}
void op::Relu::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_ptr<Node>& delta)
{
auto backprop = std::make_shared<op::ReluBackprop>(get_input_op(0), delta);
adjoints.add_delta(get_input_op(0), backprop);
}
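For reference, the adjoint computed here gates the incoming delta by the forward input: the gradient contribution is delta where arg > 0 and zero elsewhere, which is exactly what the reference kernel::relu_backprop added later in this diff evaluates elementwise.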
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/node.hpp"
#include "ngraph/ops/op.hpp"
#include "ngraph/ops/util/requires_tensor_view_args.hpp"
#include "ngraph/ops/util/unary_elementwise_arithmetic.hpp"
#include "ngraph/util.hpp"
#include <memory>
namespace ngraph
{
namespace op
{
/// \brief Elementwise Relu operation.
///
class Relu : public ngraph::op::util::UnaryElementwiseArithmetic
{
public:
/// \brief Constructs a Relu operation.
///
/// \param arg Node that produces the input tensor.
Relu(std::shared_ptr<ngraph::Node> arg);
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
{
if (new_args.size() != 1)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<Relu>(new_args.at(0));
}
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
};
/// \brief Elementwise ReluBackprop operation.
///
class ReluBackprop : public ngraph::op::util::RequiresTensorViewArgs
{
public:
/// \brief Constructs a ReluBackprop operation.
///
/// \param arg Node that produces the relu forward input tensor.
/// \param delta Node that produces the delta (the adjoint of the relu output).
ReluBackprop(std::shared_ptr<ngraph::Node> arg, std::shared_ptr<ngraph::Node> delta);
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
{
if (new_args.size() != 2)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<ReluBackprop>(new_args.at(0), new_args.at(1));
}
};
}
}
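A minimal construction sketch (illustrative only; it mirrors the graph-building pattern used by the new unit tests further down in this diff):
auto X = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{2, 5});
auto relu = std::make_shared<ngraph::op::Relu>(X);
auto f = std::make_shared<ngraph::Function>(relu, ngraph::op::Parameters{X});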
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <iostream>
#include <unordered_set>
#include "ngraph/pattern/core_fusion.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
using namespace ngraph;
using namespace std;
bool is_zero(shared_ptr<Node> reduce_constant)
{
    return is_equal_to_const_value("0", reduce_constant);
}
static shared_ptr<Node> construct_constant_node(int n)
{
return op::Constant::create(element::f32, Shape{}, {n});
}
void pass::CoreFusion::construct_relu_pattern()
{
auto iconst0 = construct_constant_node(0);
auto val = make_shared<pattern::op::Label>(iconst0);
auto zero = make_shared<pattern::op::Label>(iconst0, nullptr, Nodes{iconst0});
auto max = make_shared<op::Maximum>(zero, val);
pattern::gr_callback_fn callback = [val, zero](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_relu_pattern against "
<< m.match_root()->get_name();
auto pattern_map = m.get_pattern_map();
shared_ptr<Node> nn;
auto mzero = m.get_pattern_map()[zero];
if (!is_zero(mzero))
{
NGRAPH_DEBUG << "zero constant = " << mzero->get_name() << " not equal to 0\n";
return nn;
}
auto mpattern = m.match_root();
auto cg = shared_ptr<Node>(new op::Relu(pattern_map[val]));
return cg;
};
auto m = make_shared<pattern::Matcher>(max, callback);
this->add_matcher(m);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/pass/graph_rewrite.hpp"
namespace ngraph
{
namespace pass
{
class CoreFusion;
}
}
class ngraph::pass::CoreFusion : public ngraph::pass::GraphRewrite
{
public:
CoreFusion()
: GraphRewrite()
{
construct_relu_pattern();
}
void construct_relu_pattern();
};
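A minimal sketch of applying the pass (illustrative only; it mirrors the registration added to the CPU backend and the core_fusion unit test below, and assumes `func` is an existing ngraph::Function):
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<ngraph::pass::CoreFusion>();
pass_manager.run_passes(func); // Maximum(0, x) subgraphs in func are rewritten to Relu(x)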
......@@ -19,6 +19,7 @@
#include <cassert>
#include <memory.h>
#include "ngraph/node.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
......
......@@ -69,6 +69,7 @@
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
......@@ -3014,6 +3015,123 @@ namespace ngraph
writer.indent--;
writer << "}\n";
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ReluBackprop)
{
const auto& arg_shape = args[0].get_shape();
const size_t arg_rank = arg_shape.size();
const auto& result_shape = out[0].get_shape();
const string& et = get_mkldnn_data_type(args[0].get_element_type().c_type_string());
if (arg_rank == 4 && args[0].get_element_type() == element::f32)
{
writer << "{\n";
writer.indent++;
writer << "try {\n";
writer.indent++;
writer << "engine cpu_engine = engine(engine::cpu, 0);\n";
writer << "memory::desc input_data_desc = memory::desc({" << join(arg_shape)
<< "}, " << et << ", memory::format::nchw);\n";
writer << "memory::desc delta_data_desc = memory::desc({"
<< join(args[1].get_shape()) << "}, " << et
<< ", memory::format::nchw);\n";
writer << "memory::desc result_desc = memory::desc({" << join(result_shape)
<< "}, " << et << ", memory::format::nchw);\n";
writer << "memory input_data = memory({input_data_desc, cpu_engine}, "
<< args[0].get_name() << ");\n";
writer << "memory delta_data = memory({delta_data_desc, cpu_engine}, "
<< args[1].get_name() << ");\n";
writer << "memory result = memory({result_desc, cpu_engine}, "
<< out[0].get_name() << ");\n";
writer << "relu_forward::desc relu_fwd_desc = "
"relu_forward::desc(prop_kind::forward, "
"algorithm::eltwise_relu, input_data_desc, 0, 0);\n";
writer << "relu_forward::primitive_desc relu_fwd_prim_desc = "
"relu_forward::primitive_desc(relu_fwd_desc, cpu_engine);\n";
writer << "relu_backward::desc relu_bwd_desc = "
"relu_backward::desc(algorithm::eltwise_relu, "
"delta_data_desc, input_data_desc, 0, 0);\n";
writer << "relu_backward::primitive_desc relu_bdw_prim_desc = "
"relu_backward::primitive_desc(relu_bwd_desc, cpu_engine, "
"relu_fwd_prim_desc);\n";
writer
<< "relu_backward relu_bwd= relu_backward(relu_bdw_prim_desc, input_data, "
"delta_data, result);\n";
writer << "stream s = stream(stream::kind::eager);\n"
"s.submit({relu_bwd}).wait();\n";
writer.indent--;
writer << "} catch (const mkldnn::error& e) {\n";
writer.indent++;
writer << "throw ngraph::ngraph_error(\"MKLDNN ERROR (\" + std::to_string("
"e.status) + \"): \" + e.message);\n";
writer.indent--;
writer << "}\n";
writer.indent--;
writer << "}\n";
}
else
{
writer << "kernel::relu_backprop<" << out[0].get_type() << ">("
<< args[0].get_name() << ",\n";
writer << " " << args[1].get_name() << ",\n";
writer << " " << out[0].get_name() << ",\n";
writer << " " << out[0].get_size() << ");\n";
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Relu)
{
const auto& arg_shape = args[0].get_shape();
const size_t arg_rank = arg_shape.size();
const auto& result_shape = out[0].get_shape();
const string& et = get_mkldnn_data_type(args[0].get_element_type().c_type_string());
if (arg_rank == 4 && args[0].get_element_type() == element::f32)
{
writer << "{\n";
writer.indent++;
writer << "try {\n";
writer.indent++;
writer << "engine cpu_engine = engine(engine::cpu, 0);\n";
writer << "memory::desc input_data_desc = memory::desc({" << join(arg_shape)
<< "}, " << et << ", memory::format::nchw);\n";
writer << "memory::desc result_desc = memory::desc({" << join(result_shape)
<< "}, " << et << ", memory::format::nchw);\n";
writer << "memory input_data = memory({input_data_desc, cpu_engine}, "
<< args[0].get_name() << ");\n";
writer << "memory result = memory({result_desc, cpu_engine}, "
<< out[0].get_name() << ");\n";
writer << "relu_forward::desc relu_fwd_desc = "
"relu_forward::desc(prop_kind::forward_training, "
"algorithm::eltwise_relu, input_data_desc, 0, 0);\n";
writer << "relu_forward::primitive_desc relu_prim_desc = "
"relu_forward::primitive_desc(relu_fwd_desc, cpu_engine);\n";
writer << "relu_forward relu_fwd= relu_forward(relu_prim_desc, input_data, "
"result);\n";
writer << "stream s = stream(stream::kind::eager);\n"
"s.submit({relu_fwd}).wait();\n";
writer.indent--;
writer << "} catch (const mkldnn::error& e) {\n";
writer.indent++;
writer << "throw ngraph::ngraph_error(\"MKLDNN ERROR (\" + std::to_string("
"e.status) + \"): \" + e.message);\n";
writer.indent--;
writer << "}\n";
writer.indent--;
writer << "}\n";
}
else
{
writer << "kernel::relu<" << out[0].get_type() << ">(" << args[0].get_name()
<< ",\n";
writer << " " << out[0].get_name() << ",\n";
writer << " " << out[0].get_size() << ");\n";
}
}
}
}
}
......
......@@ -78,6 +78,7 @@
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
......@@ -97,6 +98,7 @@
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pattern/core_fusion.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_emitter.hpp"
......@@ -229,6 +231,8 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::Product), &runtime::cpu::CPU_Emitter::emit<op::Product>},
{TI(ngraph::op::Max), &runtime::cpu::CPU_Emitter::emit<op::Max>},
{TI(ngraph::op::Min), &runtime::cpu::CPU_Emitter::emit<op::Min>},
{TI(ngraph::op::Relu), &runtime::cpu::CPU_Emitter::emit<op::Relu>},
{TI(ngraph::op::ReluBackprop), &runtime::cpu::CPU_Emitter::emit<op::ReluBackprop>},
};
runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
......@@ -253,6 +257,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<ngraph::pass::CoreFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPULayout>();
pass_manager.register_pass<ngraph::pass::Liveness>();
......@@ -301,6 +306,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
#include "ngraph/runtime/kernel/product.hpp"
#include "ngraph/runtime/kernel/reduce.hpp"
#include "ngraph/runtime/kernel/reduce_window.hpp"
#include "ngraph/runtime/kernel/relu.hpp"
#include "ngraph/runtime/kernel/replace_slice.hpp"
#include "ngraph/runtime/kernel/reshape.hpp"
#include "ngraph/runtime/kernel/reverse.hpp"
......
......@@ -25,6 +25,7 @@
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/relu.hpp"
#include "mkldnn_utils.hpp"
......@@ -46,7 +47,9 @@ namespace ngraph
TI(ngraph::op::ConvolutionBackpropData),
TI(ngraph::op::ConvolutionBackpropFilters),
TI(ngraph::op::MaxPool),
TI(ngraph::op::MaxPoolBackprop)};
TI(ngraph::op::MaxPoolBackprop),
TI(ngraph::op::Relu),
TI(ngraph::op::ReluBackprop)};
static const std::unordered_map<std::string, const mkldnn::memory::data_type>
s_data_type_map{{"char", mkldnn::memory::data_type::s8},
......
......@@ -85,6 +85,7 @@
#include "ngraph/runtime/kernel/product.hpp"
#include "ngraph/runtime/kernel/reduce.hpp"
#include "ngraph/runtime/kernel/reduce_window.hpp"
#include "ngraph/runtime/kernel/relu.hpp"
#include "ngraph/runtime/kernel/replace_slice.hpp"
#include "ngraph/runtime/kernel/reshape.hpp"
#include "ngraph/runtime/kernel/reverse.hpp"
......@@ -677,6 +678,19 @@ private:
reduce_window->get_window_shape(),
reduce_window->get_window_movement_strides());
}
else if (node_op == "Relu")
{
kernel::relu<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
reinterpret_cast<T*>(out[0]->get_data_ptr()),
out[0]->get_element_count());
}
else if (node_op == "ReluBackprop")
{
kernel::relu_backprop<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
reinterpret_cast<T*>(args[1]->get_data_ptr()),
reinterpret_cast<T*>(out[0]->get_data_ptr()),
out[0]->get_element_count());
}
// else if (node_op == "Remainder")
// {
// // node = make_shared<op::Remainder>(args[0], args[1]);
......
......@@ -57,6 +57,7 @@
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
namespace ngraph
{
namespace runtime
{
namespace kernel
{
template <typename T>
void relu(T* arg, T* out, size_t count)
{
T zero = 0;
for (size_t i = 0; i < count; i++)
{
out[i] = arg[i] > zero ? arg[i] : zero;
}
}
template <typename T>
void relu_backprop(T* arg, T* delta_arg, T* out, size_t count)
{
T zero = 0;
for (size_t i = 0; i < count; i++)
{
out[i] = arg[i] > zero ? delta_arg[i] : zero;
}
}
}
}
}
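An illustrative direct call to the reference kernels (not part of this diff; input values follow the new unit tests):
float x[4] = {1.0f, -8.0f, 17.0f, -0.5f};
float delta[4] = {1.0f, 2.0f, 3.0f, 4.0f};
float fwd[4];
float bwd[4];
ngraph::runtime::kernel::relu(x, fwd, 4);                 // fwd = {1, 0, 17, 0}
ngraph::runtime::kernel::relu_backprop(x, delta, bwd, 4); // bwd = {1, 0, 3, 0}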
......@@ -58,6 +58,7 @@
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
......@@ -686,6 +687,14 @@ static shared_ptr<ngraph::Function>
{
node = make_shared<op::Remainder>(args[0], args[1]);
}
else if (node_op == "Relu")
{
node = make_shared<op::Relu>(args[0]);
}
else if (node_op == "ReluBackprop")
{
node = make_shared<op::ReluBackprop>(args[0], args[1]);
}
else if (node_op == "ReplaceSlice")
{
auto lower_bounds = node_js.at("lower_bounds").get<vector<size_t>>();
......@@ -1053,6 +1062,12 @@ static json write(const Node& n)
node["window_shape"] = tmp->get_window_shape();
node["window_movement_strides"] = tmp->get_window_movement_strides();
}
else if (node_op == "Relu")
{
}
else if (node_op == "ReluBackprop")
{
}
else if (node_op == "Remainder")
{
}
......
......@@ -31,6 +31,7 @@ set (SRC
builder_xla.cpp
build_graph.cpp
copy.cpp
core_fusion.cpp
eigen.cpp
element_type.cpp
file_util.cpp
......
......@@ -27,13 +27,13 @@
#include "ngraph/json.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/runtime/argon/ops/relu.hpp"
#include "ngraph/runtime/argon/pass/argon_fusion.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
......
......@@ -1000,6 +1000,37 @@ TEST(${BACKEND_NAME}, backwards_power)
autodiff_numeric_compare<float>(manager, backend, make_graph, {x0, x1}, .01f, .01f));
}
TEST(${BACKEND_NAME}, backwards_relu)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
test::Uniform<float> rng_neg(-1.0f, -0.01f);
test::Uniform<float> rng_pos(0.01f, 1.0f);
Shape shape{2, 3};
auto x0 = rng_neg.initialize(backend->make_primary_tensor_view<float>(shape));
auto x1 = rng_pos.initialize(backend->make_primary_tensor_view<float>(shape));
auto make_graph = [shape]() {
auto X = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Relu>(X),
std::vector<std::shared_ptr<op::Parameter>>{X});
};
for (auto i = 0; i < ${TEST_LOOPS}; i++)
{
auto x_neg = rng_neg.initialize(backend->make_primary_tensor_view<float>(shape));
EXPECT_TRUE(
autodiff_numeric_compare<float>(manager, backend, make_graph, {x_neg}, .01f, .01f));
auto x_pos = rng_pos.initialize(backend->make_primary_tensor_view<float>(shape));
EXPECT_TRUE(
autodiff_numeric_compare<float>(manager, backend, make_graph, {x_pos}, .01f, .01f));
}
}
TEST(${BACKEND_NAME}, backwards_replace_slice)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
......
......@@ -7677,3 +7677,120 @@ TEST(${BACKEND_NAME}, min_3d_eliminate_zero_dim)
cf->call({a}, {result});
EXPECT_EQ((vector<float>{inf, inf, inf, inf, inf, inf}), read_vector<float>(result));
}
TEST(${BACKEND_NAME}, relu_2Dfprop)
{
auto shape_a = Shape{2, 5};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto relu = make_shared<op::Relu>(A);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(relu, op::Parameters{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0};
cf->call({a}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, relu_4Dfprop)
{
auto shape_a = Shape{2, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto relu = make_shared<op::Relu>(A);
auto shape_rt = Shape{2, 2, 2, 2};
auto f = make_shared<Function>(relu, op::Parameters{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1};
cf->call({a}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, fuse_max_with_constant_zero_input_as_relu)
{
auto shape_a = Shape{2, 5};
auto A = op::Constant::create(element::f32, shape_a, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
auto B = make_shared<op::Parameter>(element::f32, shape_a);
auto max = make_shared<op::Maximum>(A, B);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(max, op::Parameters{B});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto b = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(b, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0};
cf->call({b}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, relu_2Dbackprop)
{
auto shape_a = Shape{2, 5};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto delta_val = make_shared<op::Parameter>(element::f32, shape_a);
auto relu = make_shared<op::ReluBackprop>(A, delta_val);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(relu, op::Parameters{A, delta_val});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto delta = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(delta, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 2, 0, 4, 0, 6, 7, 0, 9, 0};
cf->call({a, delta}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, relu_4Dbackprop)
{
auto shape_a = Shape{2, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto delta_val = make_shared<op::Parameter>(element::f32, shape_a);
auto relu = make_shared<op::ReluBackprop>(A, delta_val);
auto shape_rt = Shape{2, 2, 2, 2};
auto f = make_shared<Function>(relu, op::Parameters{A, delta_val});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1});
auto delta = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(delta, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1};
cf->call({a, delta}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <list>
#include <memory>
#include "gtest/gtest.h"
#include "ngraph/file_util.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/json.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/core_fusion.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "util/matcher.hpp"
#include "util/test_tools.hpp"
using namespace ngraph;
using namespace std;
TEST(core_fusion, core_fusion_pass_basic)
{
auto shape_a = Shape{1, 5};
auto A = op::Constant::create(element::f32, shape_a, {0, 0, 0, 0, 0});
auto B = make_shared<op::Parameter>(element::f32, shape_a);
auto max = make_shared<op::Maximum>(A, B);
auto graph = make_shared<op::Abs>(max);
pass::Manager pass_manager;
pass_manager.register_pass<pass::CoreFusion>();
auto func = make_shared<Function>(graph, op::Parameters{B});
pass_manager.run_passes(func);
ASSERT_NE(std::dynamic_pointer_cast<op::Relu>(graph->get_input_op(0)), nullptr);
}
......@@ -70,27 +70,6 @@ static std::shared_ptr<Node> construct_constant_node(int n)
return op::Constant::create(element::i32, Shape{}, {n});
}
bool is_equal_to_const_value(std::string const_value, std::shared_ptr<Node> reduce_constant)
{
if (auto rc = std::dynamic_pointer_cast<op::Constant>(reduce_constant))
{
auto cshape = rc->get_shape();
size_t n = shape_size(cshape);
//awkward(but generic) way to construct a constant of a given type, shape, value
std::vector<std::string> vz{n, const_value};
auto zero_constant = std::make_shared<op::Constant>(rc->get_element_type(), cshape, vz);
//equally awkward way to compare elements to const_value
size_t n_bytes = n * rc->get_element_type().size();
NGRAPH_DEBUG << "Comparing " << n_bytes << " bytes";
return !memcmp(zero_constant->get_data_ptr(), rc->get_data_ptr(), n_bytes);
}
else
{
return false;
}
}
bool is_zero(std::shared_ptr<Node> reduce_constant)
{
return is_equal_to_const_value("0", reduce_constant);
......
......@@ -75,4 +75,3 @@ size_t count_ops_of_type(std::shared_ptr<ngraph::Function> f)
return count;
}