Commit 583dba23 authored by pthoreho

Merge remote-tracking branch 'origin/master' into pruthvi/mkldnn_elementwise_add

parents d57ef7d3 59bdd6ee
......@@ -58,6 +58,7 @@ nervana_aeon.egg-info/
# vim
*.swp
*.swo
tags
build/
......
......@@ -18,3 +18,4 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIN_NGRAPH_LIBRARY")
add_subdirectory(resource)
add_subdirectory(ngraph)
add_subdirectory(tools)
......@@ -63,6 +63,7 @@ set (SRC
ops/power.cpp
ops/reduce.cpp
ops/reduce_window.cpp
ops/relu.cpp
ops/replace_slice.cpp
ops/reshape.cpp
ops/reverse.cpp
......@@ -95,6 +96,7 @@ set (SRC
pass/reshape_elimination.cpp
pass/visualize_tree.cpp
pattern/matcher.cpp
pattern/core_fusion.cpp
runtime/aligned_buffer.cpp
runtime/host_tensor_view.cpp
runtime/interpreter/int_backend.cpp
......
......@@ -70,6 +70,18 @@ public:
std::string generate_temporary_name(std::string prefix = "tempvar");
void block_begin()
{
*this << "{\n";
indent++;
}
void block_end()
{
indent--;
*this << "}\n";
}
private:
std::stringstream m_ss;
bool m_pending_indent;
......
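The new block_begin/block_end helpers keep emitted braces and indentation paired. A minimal usage sketch (illustrative; assumes a CodeWriter instance named writer and the operator<< / indent behavior shown above):
writer << "void f()\n";
writer.block_begin(); // emits "{\n", then deepens the indent
writer << "return;\n"; // written one indent level deep
writer.block_end(); // restores the indent, then emits "}\n"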
......@@ -25,6 +25,7 @@
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/constant.hpp"
using namespace std;
......@@ -256,3 +257,25 @@ std::shared_ptr<ngraph::Function> ngraph::clone_function(std::shared_ptr<ngraph:
// create and return cloned function
return std::make_shared<ngraph::Function>(cloned_results, cloned_params);
}
bool ngraph::is_equal_to_const_value(std::string const_value, std::shared_ptr<Node> reduce_constant)
{
if (auto rc = dynamic_pointer_cast<ngraph::op::Constant>(reduce_constant))
{
auto cshape = rc->get_shape();
size_t n = shape_size(cshape);
// generic way to construct a constant of a given element type, shape, and value
std::vector<std::string> vector_zero{n, const_value};
auto constant_val_op =
std::make_shared<ngraph::op::Constant>(rc->get_element_type(), cshape, vector_zero);
// compare the element buffers byte-for-byte against the reference constant
size_t n_bytes = n * rc->get_element_type().size();
NGRAPH_DEBUG << "Comparing " << n_bytes << " bytes";
return !memcmp(constant_val_op->get_data_ptr(), rc->get_data_ptr(), n_bytes);
}
else
{
return false;
}
}
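A usage sketch for the new helper (values illustrative; this is how the fusion pass below tests for an all-zero constant):
auto zeros = op::Constant::create(element::f32, Shape{2, 2}, {0, 0, 0, 0});
bool all_zero = ngraph::is_equal_to_const_value("0", zeros); // true
bool all_one = ngraph::is_equal_to_const_value("1", zeros); // false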
......@@ -53,6 +53,8 @@ namespace ngraph
std::list<std::shared_ptr<Node>>
topological_sort(const std::list<std::shared_ptr<Node>>& nodes);
bool is_equal_to_const_value(std::string const_value, std::shared_ptr<Node> reduce_constant);
// maps original nodes to their replacements, e.g. for clone utilities;
// performs index checking on access
class NodeMap
......
......@@ -107,6 +107,7 @@
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/multiply.hpp"
using namespace std;
using namespace ngraph;
op::Relu::Relu(shared_ptr<Node> arg)
: UnaryElementwiseArithmetic("Relu", {arg})
{
set_value_type_checked(arg->get_element_type(), arg->get_shape());
}
op::ReluBackprop::ReluBackprop(shared_ptr<Node> arg, shared_ptr<Node> delta)
: RequiresTensorViewArgs("ReluBackprop", {arg, delta})
{
if (arg->get_element_type() != delta->get_element_type())
{
throw ngraph_error("Argument and delta element types for Relu backprop do not match");
}
if (arg->get_shape() != delta->get_shape())
{
throw ngraph_error("Argument and delta shape for Relu backprop do not match");
}
set_value_type_checked(delta->get_element_type(), delta->get_shape());
}
void op::Relu::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_ptr<Node>& delta)
{
auto backprop = std::make_shared<op::ReluBackprop>(get_input_op(0), delta);
adjoints.add_delta(get_input_op(0), backprop);
}
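Since d relu(x)/dx is 1 where x > 0 and 0 elsewhere, generate_adjoints propagates delta masked by the sign of the forward input, which is exactly what ReluBackprop computes. A worked example (values illustrative):
// x                      = { 1, -2, 3, -4 }
// delta                  = { 5,  6, 7,  8 }
// ReluBackprop(x, delta) = { 5,  0, 7,  0 }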
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <memory>
#include "ngraph/node.hpp"
#include "ngraph/ops/op.hpp"
#include "ngraph/ops/util/requires_tensor_view_args.hpp"
#include "ngraph/ops/util/unary_elementwise_arithmetic.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace op
{
/// \brief Elementwise Relu operation.
///
class Relu : public ngraph::op::util::UnaryElementwiseArithmetic
{
public:
/// \brief Constructs a Relu operation.
///
/// \param arg Node that produces the input tensor.
Relu(std::shared_ptr<ngraph::Node> arg);
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
{
if (new_args.size() != 1)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<Relu>(new_args.at(0));
}
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
};
/// \brief Elementwise ReluBackprop operation.
///
class ReluBackprop : public ngraph::op::util::RequiresTensorViewArgs
{
public:
/// \brief Constructs a ReluBackprop operation.
///
/// \param arg Node that produces the relu forward input tensor.
ReluBackprop(std::shared_ptr<ngraph::Node> arg, std::shared_ptr<ngraph::Node> delta);
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
{
if (new_args.size() != 2)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<ReluBackprop>(new_args.at(0), new_args.at(1));
}
};
}
}
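Constructing the new op in a graph, a sketch along the lines of the unit tests at the end of this diff:
auto X = make_shared<op::Parameter>(element::f32, Shape{2, 3});
auto relu = make_shared<op::Relu>(X); // elementwise max(X, 0); same element type and shape as X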
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <iostream>
#include <unordered_set>
#include "ngraph/pattern/core_fusion.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
using namespace ngraph;
using namespace std;
static bool is_zero(shared_ptr<Node> reduce_constant)
{
return is_equal_to_const_value("0", reduce_constant);
}
static shared_ptr<Node> construct_constant_node(int n)
{
return op::Constant::create(element::f32, Shape{}, {n});
}
void pass::CoreFusion::construct_relu_pattern()
{
auto iconst0 = construct_constant_node(0);
auto val = make_shared<pattern::op::Label>(iconst0);
auto zero = make_shared<pattern::op::Label>(iconst0, nullptr, Nodes{iconst0});
auto max = make_shared<op::Maximum>(zero, val);
pattern::gr_callback_fn callback = [val, zero](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_relu_pattern against "
<< m.match_root()->get_name();
auto pattern_map = m.get_pattern_map();
auto mzero = pattern_map[zero];
if (!is_zero(mzero))
{
NGRAPH_DEBUG << "zero constant = " << mzero->get_name() << " not equal to 0\n";
return shared_ptr<Node>(nullptr);
}
return shared_ptr<Node>(new op::Relu(pattern_map[val]));
};
auto m = make_shared<pattern::Matcher>(max, callback);
this->add_matcher(m);
}
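The matcher fuses Maximum(zero_constant, x) into Relu(x). A sketch of a graph the pass rewrites (mirrors the core_fusion test below):
auto zero = op::Constant::create(element::f32, Shape{1, 5}, {0, 0, 0, 0, 0});
auto x = make_shared<op::Parameter>(element::f32, Shape{1, 5});
auto max = make_shared<op::Maximum>(zero, x); // CoreFusion replaces this with op::Relu(x)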
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/pass/graph_rewrite.hpp"
namespace ngraph
{
namespace pass
{
class CoreFusion;
}
}
class ngraph::pass::CoreFusion : public ngraph::pass::GraphRewrite
{
public:
CoreFusion()
: GraphRewrite()
{
construct_relu_pattern();
}
void construct_relu_pattern();
};
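Enabling the pass is a single registration with the pass manager, as the CPU backend change below does:
pass::Manager pass_manager;
pass_manager.register_pass<pass::CoreFusion>();
pass_manager.run_passes(func); // func is the shared_ptr<Function> to optimize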
......@@ -19,6 +19,7 @@
#include <cassert>
#include <memory.h>
#include "ngraph/node.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
......
......@@ -78,6 +78,7 @@
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
......@@ -97,6 +98,7 @@
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pattern/core_fusion.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_emitter.hpp"
......@@ -229,6 +231,8 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::Product), &runtime::cpu::CPU_Emitter::emit<op::Product>},
{TI(ngraph::op::Max), &runtime::cpu::CPU_Emitter::emit<op::Max>},
{TI(ngraph::op::Min), &runtime::cpu::CPU_Emitter::emit<op::Min>},
{TI(ngraph::op::Relu), &runtime::cpu::CPU_Emitter::emit<op::Relu>},
{TI(ngraph::op::ReluBackprop), &runtime::cpu::CPU_Emitter::emit<op::ReluBackprop>},
};
runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
......@@ -253,6 +257,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<ngraph::pass::CoreFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPULayout>();
pass_manager.register_pass<ngraph::pass::Liveness>();
......@@ -301,6 +306,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
#include "ngraph/runtime/kernel/product.hpp"
#include "ngraph/runtime/kernel/reduce.hpp"
#include "ngraph/runtime/kernel/reduce_window.hpp"
#include "ngraph/runtime/kernel/relu.hpp"
#include "ngraph/runtime/kernel/replace_slice.hpp"
#include "ngraph/runtime/kernel/reshape.hpp"
#include "ngraph/runtime/kernel/reverse.hpp"
......
......@@ -26,6 +26,7 @@
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/relu.hpp"
#include "mkldnn_utils.hpp"
......@@ -48,7 +49,9 @@ namespace ngraph
TI(ngraph::op::ConvolutionBackpropData),
TI(ngraph::op::ConvolutionBackpropFilters),
TI(ngraph::op::MaxPool),
TI(ngraph::op::MaxPoolBackprop)};
TI(ngraph::op::MaxPoolBackprop),
TI(ngraph::op::Relu),
TI(ngraph::op::ReluBackprop)};
static const std::unordered_map<std::string, const mkldnn::memory::data_type>
s_data_type_map{{"char", mkldnn::memory::data_type::s8},
......
......@@ -85,6 +85,7 @@
#include "ngraph/runtime/kernel/product.hpp"
#include "ngraph/runtime/kernel/reduce.hpp"
#include "ngraph/runtime/kernel/reduce_window.hpp"
#include "ngraph/runtime/kernel/relu.hpp"
#include "ngraph/runtime/kernel/replace_slice.hpp"
#include "ngraph/runtime/kernel/reshape.hpp"
#include "ngraph/runtime/kernel/reverse.hpp"
......@@ -677,6 +678,19 @@ private:
reduce_window->get_window_shape(),
reduce_window->get_window_movement_strides());
}
else if (node_op == "Relu")
{
kernel::relu<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
reinterpret_cast<T*>(out[0]->get_data_ptr()),
out[0]->get_element_count());
}
else if (node_op == "ReluBackprop")
{
kernel::relu_backprop<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
reinterpret_cast<T*>(args[1]->get_data_ptr()),
reinterpret_cast<T*>(out[0]->get_data_ptr()),
out[0]->get_element_count());
}
// else if (node_op == "Remainder")
// {
// // node = make_shared<op::Remainder>(args[0], args[1]);
......
......@@ -57,6 +57,7 @@
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/sign.hpp"
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
namespace ngraph
{
namespace runtime
{
namespace kernel
{
template <typename T>
void relu(T* arg, T* out, size_t count)
{
T zero = 0;
for (size_t i = 0; i < count; i++)
{
out[i] = arg[i] > zero ? arg[i] : zero;
}
}
template <typename T>
void relu_backprop(T* arg, T* delta_arg, T* out, size_t count)
{
T zero = 0;
for (size_t i = 0; i < count; i++)
{
out[i] = arg[i] > zero ? delta_arg[i] : zero;
}
}
}
}
}
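A self-contained sketch of the two kernels on a small buffer (assumes only the header above):
float in[4] = {-2.0f, -0.5f, 0.5f, 3.0f};
float out[4];
ngraph::runtime::kernel::relu(in, out, 4); // out = {0, 0, 0.5, 3}
float delta[4] = {1, 1, 1, 1};
float grad[4];
ngraph::runtime::kernel::relu_backprop(in, delta, grad, 4); // grad = {0, 0, 1, 1}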
......@@ -58,6 +58,7 @@
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
......@@ -686,6 +687,14 @@ static shared_ptr<ngraph::Function>
{
node = make_shared<op::Remainder>(args[0], args[1]);
}
else if (node_op == "Relu")
{
node = make_shared<op::Relu>(args[0]);
}
else if (node_op == "ReluBackprop")
{
node = make_shared<op::ReluBackprop>(args[0], args[1]);
}
else if (node_op == "ReplaceSlice")
{
auto lower_bounds = node_js.at("lower_bounds").get<vector<size_t>>();
......@@ -1053,6 +1062,12 @@ static json write(const Node& n)
node["window_shape"] = tmp->get_window_shape();
node["window_movement_strides"] = tmp->get_window_movement_strides();
}
else if (node_op == "Relu")
{
}
else if (node_op == "ReluBackprop")
{
}
else if (node_op == "Remainder")
{
}
......
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
if(MKLDNN_INCLUDE_DIR)
link_directories(${MKLDNN_LIB_DIR})
endif()
if (NGRAPH_CPU_ENABLE)
set (SRC
nbench.cpp
${PROJECT_SOURCE_DIR}/test/util/benchmark.cpp
)
add_executable(nbench ${SRC})
add_dependencies(nbench ngraph)
set(HEADER_SEARCH_DEFINES
"NGRAPH_HEADERS_PATH=\"${NGRAPH_INCLUDE_PATH}\""
)
target_link_libraries(nbench ngraph)
set_source_files_properties(nbench.cpp PROPERTIES COMPILE_DEFINITIONS "${HEADER_SEARCH_DEFINES}")
endif()
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
// tool to benchmark any ngraph JSON model with a given backend.
// compile and run with:
// g++ ./nbench.cpp -std=c++11 -I$HOME/ngraph_dist/include -L$HOME/ngraph_dist/lib -lngraph -o nbench
// env LD_LIBRARY_PATH=$HOME/ngraph_dist/lib env NGRAPH_INTERPRETER_EMIT_TIMING=1 ./nbench
// sample models are under ../../test/models
#include <fstream>
#include <ngraph/runtime/backend.hpp>
#include <ngraph/runtime/call_frame.hpp>
#include <ngraph/runtime/manager.hpp>
#include "../../test/util/benchmark.hpp"
#include "../../test/util/test_tools.hpp"
using namespace std;
int main(int argc, char** argv)
{
string model = "model.json";
string backend = "INTERPRETER";
int iter = 10;
bool failed = false;
for (int i = 1; i < argc; i++)
{
if (string(argv[i]) == "-f")
{
model = argv[++i];
}
else if (string(argv[i]) == "-b")
{
backend = argv[++i];
}
else if (string(argv[i]) == "-i")
{
try
{
iter = stoi(argv[++i]);
}
catch (...)
{
cout << "Invalid Argument\n";
failed = true;
}
}
}
if (!static_cast<bool>(ifstream(model)))
{
cout << "File " << model << " not found\n";
failed = true;
}
if (failed)
{
cout << R"###(
DESCRIPTION
Benchmark ngraph json model with given backend.
SYNOPSIS
nbench [-f <filename>] [-b <backend>] [-i <iterations>]
OPTIONS
-f Model JSON file to use (default: model.json)
-b Backend to use (default: INTERPRETER)
-i Iterations (default: 10)
)###";
return 1;
}
cout << "Benchmarking " << model << ", " << backend << " backend, " << iter << " iterations.\n";
run_benchmark(model, backend, iter);
}
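A typical invocation, matching the options parsed above (the model path is illustrative):
// ./nbench -f mxnet/mnist_mlp_forward.json -b CPU -i 100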
......@@ -31,6 +31,7 @@ set (SRC
builder_xla.cpp
build_graph.cpp
copy.cpp
core_fusion.cpp
eigen.cpp
element_type.cpp
file_util.cpp
......@@ -50,6 +51,7 @@ set (SRC
type_prop.cpp
util/autodiff/backprop_function.cpp
util/test_tools.cpp
util/benchmark.cpp
util.cpp
uuid.cpp
)
......
......@@ -27,13 +27,13 @@
#include "ngraph/json.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/runtime/argon/ops/relu.hpp"
#include "ngraph/runtime/argon/pass/argon_fusion.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
......
......@@ -1000,6 +1000,37 @@ TEST(${BACKEND_NAME}, backwards_power)
autodiff_numeric_compare<float>(manager, backend, make_graph, {x0, x1}, .01f, .01f));
}
TEST(${BACKEND_NAME}, backwards_relu)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
test::Uniform<float> rng_neg(-1.0f, -0.01f);
test::Uniform<float> rng_pos(0.01f, 1.0f);
Shape shape{2, 3};
auto x0 = rng_neg.initialize(backend->make_primary_tensor_view<float>(shape));
auto x1 = rng_pos.initialize(backend->make_primary_tensor_view<float>(shape));
auto make_graph = [shape]() {
auto X = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Relu>(X),
std::vector<std::shared_ptr<op::Parameter>>{X});
};
for (auto i = 0; i < ${TEST_LOOPS}; i++)
{
auto x_neg = rng_neg.initialize(backend->make_primary_tensor_view<float>(shape));
EXPECT_TRUE(
autodiff_numeric_compare<float>(manager, backend, make_graph, {x_neg}, .01f, .01f));
auto x_pos = rng_pos.initialize(backend->make_primary_tensor_view<float>(shape));
EXPECT_TRUE(
autodiff_numeric_compare<float>(manager, backend, make_graph, {x_pos}, .01f, .01f));
}
}
TEST(${BACKEND_NAME}, backwards_replace_slice)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
......
......@@ -31,92 +31,13 @@
#include "ngraph/runtime/manager.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "util/benchmark.hpp"
#include "util/random.hpp"
#include "util/test_tools.hpp"
using namespace std;
using namespace ngraph;
static multimap<size_t, string>
aggregate_timing(const vector<runtime::PerformanceCounter>& perf_data)
{
unordered_map<string, size_t> timing;
for (const runtime::PerformanceCounter& p : perf_data)
{
string op = p.name().substr(0, p.name().find('_'));
timing[op] += p.microseconds();
}
multimap<size_t, string> rc;
for (const pair<string, size_t>& t : timing)
{
rc.insert({t.second, t.first});
}
return rc;
}
void run_benchmark(const string& json_path, const string& backend_name, size_t iterations)
{
string env_var_name = "NGRAPH_" + backend_name + "_EMIT_TIMING";
bool emit_timing = (std::getenv(env_var_name.c_str()) != nullptr);
if (!emit_timing)
{
cout << "To get per-op timing set the environment variable " << env_var_name << "\n";
}
test::Uniform<float> rng{-1, 1, 0};
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> f = ngraph::deserialize(ss);
stopwatch build_time;
build_time.start();
auto manager = runtime::Manager::get(backend_name);
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
build_time.stop();
cout << "build_time " << build_time.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::TensorView>> args;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor =
backend->make_primary_tensor_view(param->get_element_type(), param->get_shape());
rng.initialize(tensor);
args.push_back(tensor);
}
vector<shared_ptr<runtime::TensorView>> results;
for (shared_ptr<Node> out : f->get_results())
{
auto result = backend->make_primary_tensor_view(out->get_element_type(), out->get_shape());
results.push_back(result);
}
stopwatch t1;
t1.start();
for (size_t i = 0; i < static_cast<size_t>(iterations); i++)
{
cf->tensor_call(args, results);
}
t1.stop();
float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = cf->get_performance_data();
sort(perf_data.begin(),
perf_data.end(),
[](const runtime::PerformanceCounter& p1, const runtime::PerformanceCounter& p2) {
return p1.total_microseconds() > p2.total_microseconds();
});
multimap<size_t, string> timing = aggregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++)
{
cout.imbue(locale(""));
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first << "us\n";
}
}
TEST(benchmark, mxnet_mnist_mlp_forward)
{
const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json");
......
......@@ -7677,3 +7677,120 @@ TEST(${BACKEND_NAME}, min_3d_eliminate_zero_dim)
cf->call({a}, {result});
EXPECT_EQ((vector<float>{inf, inf, inf, inf, inf, inf}), read_vector<float>(result));
}
TEST(${BACKEND_NAME}, relu_2Dfprop)
{
auto shape_a = Shape{2, 5};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto relu = make_shared<op::Relu>(A);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(relu, op::Parameters{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0};
cf->call({a}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, relu_4Dfprop)
{
auto shape_a = Shape{2, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto relu = make_shared<op::Relu>(A);
auto shape_rt = Shape{2, 2, 2, 2};
auto f = make_shared<Function>(relu, op::Parameters{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1};
cf->call({a}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, fuse_max_with_constant_zero_input_as_relu)
{
auto shape_a = Shape{2, 5};
auto A = op::Constant::create(element::f32, shape_a, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
auto B = make_shared<op::Parameter>(element::f32, shape_a);
auto max = make_shared<op::Maximum>(A, B);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(max, op::Parameters{B});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto b = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(b, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0};
cf->call({b}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, relu_2Dbackprop)
{
auto shape_a = Shape{2, 5};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto delta_val = make_shared<op::Parameter>(element::f32, shape_a);
auto relu = make_shared<op::ReluBackprop>(A, delta_val);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(relu, op::Parameters{A, delta_val});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto delta = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(delta, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 2, 0, 4, 0, 6, 7, 0, 9, 0};
cf->call({a, delta}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, relu_4Dbackprop)
{
auto shape_a = Shape{2, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto delta_val = make_shared<op::Parameter>(element::f32, shape_a);
auto relu = make_shared<op::ReluBackprop>(A, delta_val);
auto shape_rt = Shape{2, 2, 2, 2};
auto f = make_shared<Function>(relu, op::Parameters{A, delta_val});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1});
auto delta = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(delta, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1});
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
vector<float> expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1};
cf->call({a, delta}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <list>
#include <memory>
#include "gtest/gtest.h"
#include "ngraph/file_util.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/json.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/core_fusion.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "util/matcher.hpp"
#include "util/test_tools.hpp"
using namespace ngraph;
using namespace std;
TEST(core_fusion, core_fusion_pass_basic)
{
auto shape_a = Shape{1, 5};
auto A = op::Constant::create(element::f32, shape_a, {0, 0, 0, 0, 0});
auto B = make_shared<op::Parameter>(element::f32, shape_a);
auto max = make_shared<op::Maximum>(A, B);
auto graph = make_shared<op::Abs>(max);
pass::Manager pass_manager;
pass_manager.register_pass<pass::CoreFusion>();
auto func = make_shared<Function>(graph, op::Parameters{B});
pass_manager.run_passes(func);
ASSERT_NE(std::dynamic_pointer_cast<op::Relu>(graph->get_input_op(0)), nullptr);
}
......@@ -70,27 +70,6 @@ static std::shared_ptr<Node> construct_constant_node(int n)
return op::Constant::create(element::i32, Shape{}, {n});
}
bool is_equal_to_const_value(std::string const_value, std::shared_ptr<Node> reduce_constant)
{
if (auto rc = std::dynamic_pointer_cast<op::Constant>(reduce_constant))
{
auto cshape = rc->get_shape();
size_t n = shape_size(cshape);
// awkward (but generic) way to construct a constant of a given type, shape, value
std::vector<std::string> vz{n, const_value};
auto zero_constant = std::make_shared<op::Constant>(rc->get_element_type(), cshape, vz);
// equally awkward way to compare elements to const_value
size_t n_bytes = n * rc->get_element_type().size();
NGRAPH_DEBUG << "Comparing " << n_bytes << " bytes";
return !memcmp(zero_constant->get_data_ptr(), rc->get_data_ptr(), n_bytes);
}
else
{
return false;
}
}
bool is_zero(std::shared_ptr<Node> reduce_constant)
{
return is_equal_to_const_value("0", reduce_constant);
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "benchmark.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/call_frame.hpp"
#include "ngraph/runtime/manager.hpp"
#include "ngraph/runtime/tensor_view.hpp"
#include "ngraph/serializer.hpp"
#include "random.hpp"
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data)
{
std::unordered_map<std::string, size_t> timing;
for (const ngraph::runtime::PerformanceCounter& p : perf_data)
{
std::string op = p.name().substr(0, p.name().find('_'));
timing[op] += p.microseconds();
}
std::multimap<size_t, std::string> rc;
for (const std::pair<std::string, size_t>& t : timing)
{
rc.insert({t.second, t.first});
}
return rc;
}
void run_benchmark(const std::string& json_path, const std::string& backend_name, size_t iterations)
{
using namespace std;
using namespace ngraph;
string env_var_name = "NGRAPH_" + backend_name + "_EMIT_TIMING";
bool emit_timing = (std::getenv(env_var_name.c_str()) != nullptr);
if (!emit_timing)
{
cout << "To get per-op timing set the environment variable " << env_var_name << "\n";
}
ngraph::test::Uniform<float> rng{-1, 1, 0};
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> f = deserialize(ss);
stopwatch build_time;
build_time.start();
auto manager = runtime::Manager::get(backend_name);
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
build_time.stop();
cout << "build_time " << build_time.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::TensorView>> args;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor =
backend->make_primary_tensor_view(param->get_element_type(), param->get_shape());
rng.initialize(tensor);
args.push_back(tensor);
}
vector<shared_ptr<runtime::TensorView>> results;
for (shared_ptr<Node> out : f->get_results())
{
auto result = backend->make_primary_tensor_view(out->get_element_type(), out->get_shape());
results.push_back(result);
}
stopwatch t1;
t1.start();
for (size_t i = 0; i < static_cast<size_t>(iterations); i++)
{
cf->tensor_call(args, results);
}
t1.stop();
float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = cf->get_performance_data();
sort(perf_data.begin(),
perf_data.end(),
[](const runtime::PerformanceCounter& p1, const runtime::PerformanceCounter& p2) {
return p1.total_microseconds() > p2.total_microseconds();
});
multimap<size_t, string> timing = aggregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++)
{
cout.imbue(locale(""));
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first << "us\n";
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <ngraph/runtime/call_frame.hpp>
#include "test_tools.hpp"
/// performance test utilities
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
void run_benchmark(const std::string& json_path,
const std::string& backend_name,
size_t iterations);
......@@ -21,7 +21,9 @@
#include <memory>
#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/tensor_view.hpp"
#include "ngraph/serializer.hpp"
namespace ngraph
{
......