Unverified Commit 7f3dc2d7 authored by Robert Kimball's avatar Robert Kimball Committed by GitHub

Optimizations to reduce compile time (#357)

* much faster compile time
* Remove all variables and just directly access inputs, output, and temps.
* compare layouts when checking if two ops are equal
* make performance counters available to all backends
parent 524d04fc
......@@ -76,5 +76,6 @@ void codegen::ExecutionEngine::finalize()
/// Look up a JIT-compiled symbol by name.
/// \param func_name Mangled name of the function to resolve.
/// \return Address of the compiled function, or nullptr when it is not found.
void* codegen::ExecutionEngine::get_pointer_to_named_function(const std::string& func_name)
{
    // Set AbortOnFailure flag to false so the call fails by returning nullptr
    // instead of terminating the process when the symbol is missing.
    return m_execution_engine->getPointerToNamedFunction(func_name, false);
}
......@@ -18,6 +18,7 @@
#include <typeinfo>
#include "ngraph/autodiff/adjoints.hpp"
#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/ops/parameter.hpp"
......@@ -334,3 +335,53 @@ bool Node::has_same_type(std::shared_ptr<const Node> node) const
}
return true;
}
bool Node::is_functionally_identical(const Node& other) const
{
    // Two nodes can only be functionally identical when they are the same kind
    // of op (same description string) with matching input/output signatures.
    if (this->description() != other.description())
    {
        return false;
    }
    const deque<descriptor::Input>& i1 = this->get_inputs();
    const deque<descriptor::Input>& i2 = other.get_inputs();
    const deque<descriptor::Output>& o1 = this->get_outputs();
    const deque<descriptor::Output>& o2 = other.get_outputs();
    if (i1.size() != i2.size() || o1.size() != o2.size())
    {
        return false;
    }
    // Each input must agree in shape and in tensor-view layout; bail out on
    // the first mismatch rather than scanning the remaining tensors.
    for (size_t i = 0; i < i1.size(); i++)
    {
        auto tvl1 = i1[i].get_output().get_tensor_view()->get_tensor_view_layout();
        auto tvl2 = i2[i].get_output().get_tensor_view()->get_tensor_view_layout();
        if (tvl1->get_shape() != tvl2->get_shape() || *tvl1 != *tvl2)
        {
            return false;
        }
    }
    // Same check for each output.
    for (size_t i = 0; i < o1.size(); i++)
    {
        auto tvl1 = o1[i].get_tensor_view()->get_tensor_view_layout();
        auto tvl2 = o2[i].get_tensor_view()->get_tensor_view_layout();
        if (tvl1->get_shape() != tvl2->get_shape() || *tvl1 != *tvl2)
        {
            return false;
        }
    }
    return true;
}
......@@ -160,6 +160,8 @@ namespace ngraph
// True if this and node have one output with same element type and shape
bool has_same_type(std::shared_ptr<const Node> node) const;
virtual bool is_functionally_identical(const Node&) const;
protected:
void add_output(const element::Type& element_type, const Shape& shape);
void assert_argument_list_equivalency(const Nodes& b);
......
......@@ -102,3 +102,18 @@ void op::Concat::generate_adjoints(autodiff::Adjoints& adjoints, const std::shar
pos = next_pos;
}
}
bool op::Concat::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const Concat& concat = dynamic_cast<const Concat&>(other);
rc &= m_concatenation_axis == concat.m_concatenation_axis;
}
else
{
rc = false;
}
return rc;
}
......@@ -74,6 +74,8 @@ namespace ngraph
/// \return The concatenation axis.
size_t get_concatenation_axis() const { return m_concatenation_axis; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -43,7 +43,9 @@ std::string to_cpp_string(T value)
}
else
{
rc = to_string(value);
stringstream ss;
ss << value;
rc = ss.str();
}
return rc;
}
......
......@@ -257,6 +257,30 @@ std::shared_ptr<Node>
m_padding_above);
}
bool op::Convolution::is_functionally_identical(const Node& other) const
{
    // Base check covers op kind plus input/output shapes and layouts; the
    // rest compares every convolution attribute.
    if (!Node::is_functionally_identical(other))
    {
        return false;
    }
    const Convolution& rhs = dynamic_cast<const Convolution&>(other);
    return m_window_movement_strides == rhs.m_window_movement_strides &&
           m_window_dilation_strides == rhs.m_window_dilation_strides &&
           m_input_channel_count == rhs.m_input_channel_count &&
           m_output_channel_count == rhs.m_output_channel_count &&
           m_input_image_shape == rhs.m_input_image_shape &&
           m_output_image_shape == rhs.m_output_image_shape &&
           m_window_physical_shape == rhs.m_window_physical_shape &&
           m_window_virtual_shape == rhs.m_window_virtual_shape &&
           m_batch_size == rhs.m_batch_size &&
           m_image_dimension_count == rhs.m_image_dimension_count;
}
/*
void op::Convolution::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_ptr<Node>& delta)
{
......
......@@ -125,6 +125,8 @@ namespace ngraph
size_t get_batch_size() const { return m_batch_size; }
/// \return The number of image dimensions.
size_t get_image_dimension_count() const { return m_image_dimension_count; }
bool is_functionally_identical(const Node&) const override;
protected:
Strides m_window_movement_strides;
Strides m_window_dilation_strides;
......
......@@ -143,3 +143,18 @@ void op::Dot::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_
auto x_reshaped_dot_delta = make_shared<Dot>(x_reshaped, delta, I_shape.size()); // JK
adjoints.add_delta(y, x_reshaped_dot_delta);
}
bool op::Dot::is_functionally_identical(const Node& other) const
{
    // Identical only when the base checks pass and both ops reduce the same
    // number of axes.
    if (!Node::is_functionally_identical(other))
    {
        return false;
    }
    const Dot& rhs = dynamic_cast<const Dot&>(other);
    return m_reduction_axes_count == rhs.m_reduction_axes_count;
}
......@@ -85,6 +85,7 @@ namespace ngraph
return std::make_shared<Dot>(
new_args.at(0), new_args.at(1), m_reduction_axes_count);
}
bool is_functionally_identical(const Node&) const override;
protected:
size_t m_reduction_axes_count;
......
......@@ -47,3 +47,8 @@ op::FunctionCall::FunctionCall(std::shared_ptr<Function> function,
add_output(function->get_output_element_type(i), function->get_output_shape(i));
}
}
// Always false: no structural comparison of the called Functions is
// attempted, so two FunctionCall nodes are conservatively treated as
// distinct even when they invoke the same Function.
bool op::FunctionCall::is_functionally_identical(const Node&) const
{
return false;
}
......@@ -55,6 +55,8 @@ namespace ngraph
return std::make_shared<FunctionCall>(m_function, new_args);
}
/// \return The function to be called.
bool is_functionally_identical(const Node&) const override;
/// \return A singleton vector containing the function to be called.
std::vector<std::shared_ptr<Function>> get_functions() const override
{
......
......@@ -147,6 +147,27 @@ op::MaxPool::MaxPool(const std::shared_ptr<Node>& arg, const Shape& window_shape
{
}
bool op::MaxPool::is_functionally_identical(const Node& other) const
{
    // Base check covers op kind plus input/output shapes and layouts; the
    // rest compares every pooling attribute.
    if (!Node::is_functionally_identical(other))
    {
        return false;
    }
    const MaxPool& rhs = dynamic_cast<const MaxPool&>(other);
    return m_window_shape == rhs.m_window_shape &&
           m_window_movement_strides == rhs.m_window_movement_strides &&
           m_channel_count == rhs.m_channel_count &&
           m_input_image_shape == rhs.m_input_image_shape &&
           m_output_image_shape == rhs.m_output_image_shape &&
           m_batch_size == rhs.m_batch_size &&
           m_image_dimension_count == rhs.m_image_dimension_count;
}
/*
void op::MaxPool::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_ptr<Node>& delta)
{
......
......@@ -77,6 +77,8 @@ namespace ngraph
size_t get_batch_size() const { return m_batch_size; }
/// \return The number of image dimensions.
size_t get_image_dimension_count() const { return m_image_dimension_count; }
bool is_functionally_identical(const Node&) const override;
protected:
Shape m_window_shape;
Strides m_window_movement_strides;
......
......@@ -41,3 +41,19 @@ op::OneHot::OneHot(const std::shared_ptr<Node>& arg, const Shape& shape, size_t
set_value_type_checked(make_shared<TensorViewType>(input_element_type, shape));
}
bool op::OneHot::is_functionally_identical(const Node& other) const
{
    // Identical only when the base checks pass and both the output shape and
    // the one-hot axis match.
    if (!Node::is_functionally_identical(other))
    {
        return false;
    }
    const OneHot& rhs = dynamic_cast<const OneHot&>(other);
    return m_shape == rhs.m_shape && m_one_hot_axis == rhs.m_one_hot_axis;
}
......@@ -60,6 +60,8 @@ namespace ngraph
/// \return The index of the one-hot axis.
size_t get_one_hot_axis() const { return m_one_hot_axis; }
bool is_functionally_identical(const Node&) const override;
protected:
Shape m_shape;
size_t m_one_hot_axis;
......
......@@ -90,3 +90,8 @@ op::Reduce::Reduce(const std::shared_ptr<Node>& arg_reductee,
add_output(input_reductee.get_element_type(), result_shape);
}
// Always false: Reduce embeds a nested reduction Function, and no structural
// comparison of those functions is attempted, so two Reduce nodes are
// conservatively treated as distinct.
bool op::Reduce::is_functionally_identical(const Node& /*other*/) const
{
    return false;
}
......@@ -110,6 +110,8 @@ namespace ngraph
}
/// \return The axis positions (0-based) to be eliminated through reduction.
const AxisSet& get_reduction_axes() const { return m_reduction_axes; }
bool is_functionally_identical(const Node&) const override;
protected:
std::shared_ptr<Function> m_reduction_function;
AxisSet m_reduction_axes;
......
......@@ -132,3 +132,20 @@ void op::ReplaceSlice::generate_adjoints(autodiff::Adjoints& adjoints,
adjoints.add_delta(
y, std::make_shared<op::Slice>(delta, m_lower_bounds, m_upper_bounds, m_strides));
}
bool op::ReplaceSlice::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const ReplaceSlice& slice = dynamic_cast<const ReplaceSlice&>(other);
rc &= m_lower_bounds == slice.m_lower_bounds;
rc &= m_upper_bounds == slice.m_upper_bounds;
rc &= m_strides == slice.m_strides;
}
else
{
rc = false;
}
return rc;
}
......@@ -87,6 +87,8 @@ namespace ngraph
const Coordinate& get_upper_bounds() const { return m_upper_bounds; }
/// \return The slicing strides.
const Strides& get_strides() const { return m_strides; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -99,3 +99,19 @@ void op::Reshape::generate_adjoints(autodiff::Adjoints& adjoints,
adjoints.add_delta(get_input_op(0), reshape);
}
bool op::Reshape::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const Reshape& reshape = dynamic_cast<const Reshape&>(other);
rc &= m_input_order == reshape.m_input_order;
rc &= m_output_shape == reshape.m_output_shape;
}
else
{
rc = false;
}
return rc;
}
......@@ -79,6 +79,8 @@ namespace ngraph
const AxisVector& get_input_order() const { return m_input_order; }
/// \return The shape of the output tensor.
const Shape& get_output_shape() const { return m_output_shape; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -97,3 +97,20 @@ void op::Slice::generate_adjoints(autodiff::Adjoints& adjoints, const std::share
adjoints.add_delta_to_slice(x, delta, m_lower_bounds, m_upper_bounds, m_strides);
}
bool op::Slice::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const Slice& slice = dynamic_cast<const Slice&>(other);
rc &= m_lower_bounds == slice.m_lower_bounds;
rc &= m_upper_bounds == slice.m_upper_bounds;
rc &= m_strides == slice.m_strides;
}
else
{
rc = false;
}
return rc;
}
......@@ -85,6 +85,8 @@ namespace ngraph
const Coordinate& get_upper_bounds() const { return m_upper_bounds; }
/// \return The slicing strides.
const Strides& get_strides() const { return m_strides; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -60,3 +60,18 @@ void op::Sum::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_
adjoints.add_delta(x, make_shared<op::Broadcast>(delta, x_shape, m_reduction_axes));
}
bool op::Sum::is_functionally_identical(const Node& other) const
{
bool rc = true;
if (Node::is_functionally_identical(other))
{
const Sum& slice = dynamic_cast<const Sum&>(other);
rc &= m_reduction_axes == slice.m_reduction_axes;
}
else
{
rc = false;
}
return rc;
}
......@@ -93,6 +93,8 @@ namespace ngraph
/// \return The axis positions (0-based) to be eliminated through summation.
const AxisSet& get_reduction_axes() const { return m_reduction_axes; }
bool is_functionally_identical(const Node&) const override;
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
......
......@@ -24,6 +24,28 @@ namespace ngraph
{
namespace runtime
{
class PrimaryTensorView;
class Value;
// Per-op timing record made available to all backends: the op's name, the
// accumulated wall-clock time, and how many times the op was executed.
class PerformanceCounter
{
public:
    /// \param n     Name of the timed op (copied into the counter).
    /// \param us    Total accumulated time, in microseconds.
    /// \param calls Number of times the op was invoked.
    PerformanceCounter(const char* n, size_t us, size_t calls)
        : m_name(n)
        , m_total_microseconds(us)
        , m_call_count(calls)
    {
    }
    /// \return Name of the timed op.
    const std::string& name() const { return m_name; }
    /// \return Total accumulated time over all calls, in microseconds.
    size_t total_microseconds() const { return m_total_microseconds; }
    /// \return Average time per call, or 0 when the op was never called
    ///         (guards against division by zero on a zero call count).
    size_t microseconds() const
    {
        return m_call_count == 0 ? 0 : m_total_microseconds / m_call_count;
    }
    /// \return Number of times the op was invoked.
    size_t call_count() const { return m_call_count; }

private:
    std::string m_name;
    size_t m_total_microseconds;
    size_t m_call_count;
};
// A VM for executing lightly-compiled graph functions.
class CallFrame
{
......@@ -38,6 +60,11 @@ namespace ngraph
/// @brief Invoke the function with tuples pre-expanded to their underlying tensor views.
virtual void tensor_call(const TensorViewPtrs& inputs,
const TensorViewPtrs& outputs) = 0;
/// @brief Per-op performance counters for the most recent executions.
///        Backends that do not collect timing inherit this default, which
///        returns an empty vector.
virtual std::vector<PerformanceCounter> get_performance_data() const
{
return std::vector<PerformanceCounter>();
}
};
}
}
......@@ -71,39 +71,26 @@ void runtime::cpu::CPU_CallFrame::call(
tensor_call(inputs, outputs);
}
/// Collect per-op timing data from the JIT-compiled module's debug-timer
/// accessor functions.
/// \return One PerformanceCounter per timer; empty when timing was not
///         emitted (accessors absent) or no execution engine exists.
vector<runtime::PerformanceCounter> runtime::cpu::CPU_CallFrame::get_performance_data() const
{
    vector<runtime::PerformanceCounter> rc;
    auto* engine = m_external_function->m_execution_engine.get();
    if (engine)
    {
        auto get_count = engine->find_function<size_t()>("get_debug_timer_count");
        auto get_name = engine->find_function<const char*(size_t)>("get_debug_timer_name");
        auto get_microseconds =
            engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
        auto get_call_count = engine->find_function<size_t(size_t)>("get_debug_timer_call_count");

        // The accessors only exist when the function was compiled with timing
        // enabled; quietly return no data instead of throwing when absent.
        if (get_count && get_name && get_microseconds && get_call_count)
        {
            size_t count = get_count();
            for (size_t i = 0; i < count; i++)
            {
                rc.push_back({get_name(i), get_microseconds(i), get_call_count(i)});
            }
        }
    }
    return rc;
}
......@@ -33,25 +33,6 @@ namespace ngraph
class CPU_CallFrame;
class CPU_ExternalFunction;
class PerformanceCounter
{
public:
PerformanceCounter(const char* n, size_t us, size_t calls)
: m_name(n)
, m_total_microseconds(us)
, m_call_count(calls)
{
}
const std::string& name() const { return m_name; }
size_t total_microseconds() const { return m_total_microseconds; }
size_t microseconds() const { return m_total_microseconds / m_call_count; }
size_t call_count() const { return m_call_count; }
private:
std::string m_name;
size_t m_total_microseconds;
size_t m_call_count;
};
using EntryPoint_t = void(void** inputs, void** outputs);
using EntryPoint = std::function<EntryPoint_t>;
......@@ -66,15 +47,17 @@ namespace ngraph
/// @brief Invoke the function with values matching the signature of the function.
///
/// Tuples will be expanded into their tensor views to build the call frame.
void call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs);
void
call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs) override;
/// @brief Invoke the function with tuples pre-expanded to their underlying
/// tensor views.
void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
const std::vector<std::shared_ptr<TensorView>>& outputs);
const std::vector<std::shared_ptr<TensorView>>& outputs) override;
std::vector<ngraph::runtime::cpu::PerformanceCounter> get_performance_data() const;
std::vector<ngraph::runtime::PerformanceCounter>
get_performance_data() const override;
protected:
std::shared_ptr<CPU_ExternalFunction> m_external_function;
......
......@@ -78,6 +78,7 @@ namespace ngraph
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
};
}
}
......
......@@ -19,8 +19,10 @@
using namespace std;
using namespace ngraph;
// Wrap a tensor view, optionally under an alias name that overrides the
// tensor's own name (see get_name). The alias defaults to empty.
runtime::cpu::TensorViewWrapper::TensorViewWrapper(const shared_ptr<descriptor::TensorView>& tv,
                                                   const string& alias)
    : m_tensor_view(tv)
    , m_alias(alias)
{
}
......@@ -46,7 +48,14 @@ const element::Type& runtime::cpu::TensorViewWrapper::get_element_type() const
/// \return The alias when one was assigned to this wrapper, otherwise the
///         wrapped tensor's own name.
const std::string& runtime::cpu::TensorViewWrapper::get_name() const
{
    if (m_alias.empty())
    {
        return m_tensor_view->get_tensor().get_name();
    }
    else
    {
        return m_alias;
    }
}
const std::string& runtime::cpu::TensorViewWrapper::get_type() const
......
......@@ -33,7 +33,8 @@ namespace ngraph
class ngraph::runtime::cpu::TensorViewWrapper
{
public:
TensorViewWrapper(const std::shared_ptr<descriptor::TensorView>&);
TensorViewWrapper(const std::shared_ptr<descriptor::TensorView>&,
const std::string& alias = "");
size_t get_size() const;
const std::vector<size_t>& get_shape() const;
......@@ -45,4 +46,5 @@ public:
private:
std::shared_ptr<descriptor::TensorView> m_tensor_view;
std::string m_alias;
};
......@@ -13,6 +13,8 @@
// ----------------------------------------------------------------------------
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include "ngraph/runtime/interpreter/int_call_frame.hpp"
#include "ngraph/runtime/interpreter/int_tensor_view.hpp"
......@@ -24,6 +26,7 @@ runtime::interpreter::INT_CallFrame::INT_CallFrame(shared_ptr<ExternalFunction>
shared_ptr<Function> func)
: m_external_function(external_function)
, m_function(func)
, m_emit_timing(std::getenv("NGRAPH_INTERPRETER_EMIT_TIMING") != nullptr)
{
}
......@@ -136,7 +139,16 @@ void runtime::interpreter::INT_CallFrame::call(
secondary_type = op->get_inputs().at(0).get_tensor().get_element_type();
}
if (m_emit_timing)
{
m_timer_map[op.get()].start();
}
generate_calls(base_type, secondary_type, *op, inputs, outputs);
if (m_emit_timing)
{
stopwatch& timer = m_timer_map[op.get()];
timer.stop();
}
handle_output_alias(*op, output_alias_map, output_tvs);
......@@ -281,3 +293,16 @@ void runtime::interpreter::INT_CallFrame::call(
tensor_call(inputs, outputs);
}
/// Convert the interpreter's per-op stopwatch map into PerformanceCounter
/// records.
/// \return One counter per timed Node; empty when timing was never enabled.
vector<runtime::PerformanceCounter>
    runtime::interpreter::INT_CallFrame::get_performance_data() const
{
    vector<runtime::PerformanceCounter> rc;
    rc.reserve(m_timer_map.size());
    // Iterate by const reference: the previous by-value loop copied a
    // pair<const Node*, stopwatch> on every iteration.
    for (const auto& p : m_timer_map)
    {
        rc.emplace_back(p.first->get_name().c_str(),
                        p.second.get_total_microseconds(),
                        p.second.get_call_count());
    }
    return rc;
}
......@@ -112,13 +112,14 @@ public:
///
/// Tuples will be expanded into their tensor views to build the call frame.
void call(const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs);
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs) override;
std::vector<runtime::PerformanceCounter> get_performance_data() const override;
private:
/// @brief Invoke the function with tuples pre-expanded to their underlying
/// tensor views.
void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
const std::vector<std::shared_ptr<TensorView>>& outputs);
const std::vector<std::shared_ptr<TensorView>>& outputs) override;
void tensor_call(const std::vector<std::shared_ptr<INT_TensorView>>& inputs,
const std::vector<std::shared_ptr<INT_TensorView>>& outputs);
void call(std::shared_ptr<Function> function,
......@@ -131,6 +132,9 @@ private:
std::shared_ptr<ExternalFunction> m_external_function;
std::shared_ptr<Function> m_function;
bool m_emit_timing;
std::unordered_map<const Node*, stopwatch> m_timer_map;
void generate_calls(const element::Type& base_type,
const element::Type& secondary_type,
ngraph::Node& op,
......
......@@ -42,10 +42,10 @@ static void copy_data(shared_ptr<runtime::TensorView> tv, const vector<T>& data)
}
static multimap<size_t, string>
agregate_timing(const vector<runtime::cpu::PerformanceCounter>& perf_data)
agregate_timing(const vector<runtime::PerformanceCounter>& perf_data)
{
unordered_map<string, size_t> timing;
for (const runtime::cpu::PerformanceCounter& p : perf_data)
for (const runtime::PerformanceCounter& p : perf_data)
{
string op = p.name().substr(0, p.name().find('_'));
timing[op] += p.microseconds();
......@@ -59,12 +59,13 @@ static multimap<size_t, string>
return rc;
}
void run_benchmark(const std::string& json_path, size_t iterations)
void run_benchmark(const string& json_path, const string& backend_name, size_t iterations)
{
bool emit_timing = (std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr);
string env_var_name = "NGRAPH_" + backend_name + "_EMIT_TIMING";
bool emit_timing = (std::getenv(env_var_name.c_str()) != nullptr);
if (!emit_timing)
{
cout << "To get per-op timing set the environment variable NGRAPH_CPU_EMIT_TIMING\n";
cout << "To get per-op timing set the environment variable " << env_var_name << "\n";
}
test::Uniform<float> rng{-1, 1, 0};
......@@ -74,11 +75,10 @@ void run_benchmark(const std::string& json_path, size_t iterations)
stopwatch build_time;
build_time.start();
auto manager = runtime::Manager::get("CPU");
auto manager = runtime::Manager::get(backend_name);
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
runtime::cpu::CPU_CallFrame* cpu_cf = static_cast<runtime::cpu::CPU_CallFrame*>(cf.get());
build_time.stop();
cout << "build_time " << build_time.get_milliseconds() << "ms" << endl;
......@@ -107,47 +107,60 @@ void run_benchmark(const std::string& json_path, size_t iterations)
float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
if (emit_timing)
vector<runtime::PerformanceCounter> perf_data = cf->get_performance_data();
sort(perf_data.begin(),
perf_data.end(),
[](const runtime::PerformanceCounter& p1, const runtime::PerformanceCounter& p2) {
return p1.total_microseconds() > p2.total_microseconds();
});
multimap<size_t, string> timing = agregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++)
{
vector<runtime::cpu::PerformanceCounter> perf_data = cpu_cf->get_performance_data();
sort(perf_data.begin(),
perf_data.end(),
[](const runtime::cpu::PerformanceCounter& p1,
const runtime::cpu::PerformanceCounter& p2) {
return p1.total_microseconds() > p2.total_microseconds();
});
multimap<size_t, string> timing = agregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++)
{
cout.imbue(locale(""));
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first
<< "us\n";
}
cout.imbue(locale(""));
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first << "us\n";
}
}
// Benchmark entry points. Each test loads a serialized model and runs it
// through run_benchmark(json_path, backend_name, iterations); the stale
// two-argument run_benchmark calls left over from before the backend-name
// parameter was added have been removed.
TEST(benchmark, mxnet_mnist_mlp_forward)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json");
    run_benchmark(json_path, "CPU", 1000);
}

TEST(benchmark, mxnet_10_bucket_lstm)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/10_bucket_LSTM.json");
    run_benchmark(json_path, "CPU", 10);
}

TEST(benchmark, mxnet_10_bucket_lstm_int)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/10_bucket_LSTM.json");
    run_benchmark(json_path, "INTERPRETER", 10);
}

TEST(benchmark, mxnet_lstm_backward)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/LSTM_backward.json");
    run_benchmark(json_path, "CPU", 10);
}

TEST(benchmark, mxnet_lstm_backward_int)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/LSTM_backward.json");
    run_benchmark(json_path, "INTERPRETER", 1);
}

TEST(benchmark, mxnet_lstm_forward)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/LSTM_forward.json");
    run_benchmark(json_path, "CPU", 10);
}

TEST(benchmark, mxnet_lstm_forward_int)
{
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/LSTM_forward.json");
    run_benchmark(json_path, "INTERPRETER", 10);
}
//
......
......@@ -1208,7 +1208,7 @@ TEST(${BACKEND_NAME}, log)
auto result = backend->make_primary_tensor_view(element::f32, shape);
cf->call({a}, {result});
EXPECT_EQ(loga, result->get_vector<float>());
EXPECT_TRUE(test::all_close(loga, result->get_vector<float>()));
}
TEST(${BACKEND_NAME}, maximum)
......@@ -2615,7 +2615,7 @@ TEST(${BACKEND_NAME}, cosh)
input.begin(), input.end(), input.begin(), [](float x) -> float { return coshf(x); });
cf->call({a}, {result});
EXPECT_EQ(input, result->get_vector<float>());
EXPECT_TRUE(test::all_close(input, result->get_vector<float>()));
}
TEST(${BACKEND_NAME}, tanh)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment