Merge remote-tracking branch 'origin/master' into tfl/gpu_dot_back

0b99a7a1 · fenglei.tian · bc4aefed · 41a883b1 · 0b99a7a1 · 0b99a7a1
Commit 0b99a7a1 authored Mar 12, 2018 by fenglei.tian
21 changed files
--- a/src/ngraph/graph_util.hpp
+++ b/src/ngraph/graph_util.hpp
@@ -16,6 +16,7 @@

 #pragma once

+#include <functional>
 #include <list>
 #include <memory>
 #include <string>

--- a/src/ngraph/pass/assign_placement.hpp
+++ b/src/ngraph/pass/assign_placement.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <exception>
+#include <functional>
 #include <sstream>

 #include "ngraph/pass/pass.hpp"

--- a/src/ngraph/runtime/cpu/cpu_call_frame.cpp
+++ b/src/ngraph/runtime/cpu/cpu_call_frame.cpp
@@ -146,6 +146,7 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
    }
    const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter();
    ctx->mkldnn_primitives = mkldnn_emitter->get_mkldnn_primitives().data();
+    ctx->mkldnn_workspaces = mkldnn_emitter->get_mkldnn_workspaces().data();
 }

 void runtime::cpu::CPU_CallFrame::cleanup_runtime_context()

--- a/src/ngraph/runtime/cpu/cpu_emitter.cpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.cpp
--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -258,7 +258,7 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
    const shared_ptr<ngraph::Function>& function, bool release_function)
    : ngraph::runtime::ExternalFunction(function, release_function)
    , m_compiled_function(nullptr)
-    , m_emit_timing(std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr)
+    , m_emit_timing(false)
    , m_use_tbb(std::getenv("NGRAPH_CPU_USE_TBB") != nullptr)
    , m_function_name(function->get_name())
 {
@@ -275,6 +275,8 @@ void runtime::cpu::CPU_ExternalFunction::compile()
        return;
    }

+    m_emit_timing = m_timing | (std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr);
+
    m_mkldnn_emitter.reset(new MKLDNNEmitter());

    ngraph::pass::Manager pass_manager;
@@ -370,6 +372,7 @@ using namespace ngraph::runtime;
    {
        writer << "// Declare debug timers\n";
        vector<string> names;
+        size_t index = 0;
        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
        {
            for (shared_ptr<Node> node : current_function->get_ordered_ops())
@@ -377,59 +380,43 @@ using namespace ngraph::runtime;
                if (!node->is_parameter() && !node->is_constant())
                {
                    names.push_back(node->get_name());
+                    m_name_index_map.insert({node->get_name(), index++});
                }
            }
        }
-        for (const string& s : names)
-        {
-            writer << "ngraph::stopwatch timer_" << s << ";\n";
-        }
+        writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
        writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
               << "; }\n";
        writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
        writer << "{\n";
        writer.indent++;
-        writer << "const char* rc;\n";
-        writer << "switch(index)\n";
+        writer << "static const char* timer_names[" << names.size() << "] =\n";
        writer << "{\n";
-        for (size_t i = 0; i < names.size(); i++)
+        writer.indent++;
+        vector<string> quoted_names;
+        for (const string& name : names)
        {
-            writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
+            quoted_names.push_back("\"" + name + "\"");
        }
-        writer << "default: rc = \"\";\n";
-        writer << "}\n";
-        writer << "return rc;\n";
+        writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
+        writer << "\n};\n";
+        writer.indent--;
+        writer << "return timer_names[index];\n";
        writer.indent--;
        writer << "}\n";
+
        writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
        writer << "{\n";
        writer.indent++;
-        writer << "size_t rc;\n";
-        writer << "switch(index)\n";
-        writer << "{\n";
-        for (size_t i = 0; i < names.size(); i++)
-        {
-            writer << "case " << i << ": rc = timer_" << names[i]
-                   << ".get_total_microseconds(); break;\n";
-        }
-        writer << "default: rc = 0;\n";
-        writer << "}\n";
-        writer << "return rc;\n";
+        writer << "return (index < " << names.size()
+               << " ? timers[index].get_total_microseconds() : 0);\n";
        writer.indent--;
        writer << "}\n";
+
        writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
        writer << "{\n";
        writer.indent++;
-        writer << "size_t rc;\n";
-        writer << "switch(index)\n";
-        writer << "{\n";
-        for (size_t i = 0; i < names.size(); i++)
-        {
-            writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
-        }
-        writer << "default: rc = 0;\n";
-        writer << "}\n";
-        writer << "return rc;\n";
+        writer << "return (index < " << names.size() << " ? timers[index].get_call_count() : 0);\n";
        writer.indent--;
        writer << "}\n";
        writer << "\n";
@@ -443,15 +430,11 @@ using namespace ngraph::runtime;
            const ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
            if (c)
            {
+                m_active_constants.push_back(node);
                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
-                auto c_value_strings = c->get_value_strings();
-                writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
-                       << tv->get_tensor().get_name() << "[" << c_value_strings.size() << "] =\n";
-                writer << "{\n";
-                writer.indent++;
-                writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
-                writer.indent--;
-                writer << "\n};\n\n";
+                string type = tv->get_tensor().get_element_type().c_type_string();
+                writer << "static " << type << "* " << tv->get_tensor().get_name() << " = (("
+                       << type << "*)(" << c->get_data_ptr() << "));\n";
                m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
            }
        }
@@ -702,10 +685,6 @@ using namespace ngraph::runtime;
                           << "(G, [&](const tbb::flow::continue_msg &msg)\n{\n";
                    writer.indent++;
                }
-                if (m_emit_timing)
-                {
-                    emit_debug_function_entry(writer, node.get(), in, out);
-                }
                if (runtime::cpu::IsTracingEnabled() &&
                    current_function->get_name() == m_function_name)
                {
@@ -713,14 +692,21 @@ using namespace ngraph::runtime;
                }
            }

-            writer << "\n// " << node->get_name() << "(";
-            vector<string> parameter_nodes = node_input_names;
-            parameter_nodes.insert(
-                parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
-            writer << join(parameter_nodes);
-            writer << ")\n";
+            if (!node->is_parameter() && !node->is_constant())
+            {
+                writer << "\n// " << node->get_name() << "(";
+                vector<string> parameter_nodes = node_input_names;
+                parameter_nodes.insert(
+                    parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
+                writer << join(parameter_nodes);
+                writer << ")\n";
+            }

            // Emit operation body
+            if (!node->is_parameter() && !node->is_constant())
+            {
+                emit_debug_function_entry(writer, node.get(), in, out);
+            }
            string func_name;
            auto it = match_functions.find(node.get());
            if (it == match_functions.end())
@@ -745,10 +731,7 @@ using namespace ngraph::runtime;
            // Emit operation epilogue
            if (!node->is_parameter() && !node->is_constant())
            {
-                if (m_emit_timing)
-                {
-                    emit_debug_function_exit(writer, node.get(), in, out);
-                }
+                emit_debug_function_exit(writer, node.get(), in, out);
                if (runtime::cpu::IsTracingEnabled() &&
                    current_function->get_name() == m_function_name)
                {
@@ -909,7 +892,10 @@ void runtime::cpu::CPU_ExternalFunction::emit_debug_function_entry(
    const std::vector<TensorViewWrapper>& in,
    const std::vector<TensorViewWrapper>& out)
 {
-    writer << "timer_" << node->get_name() << ".start();\n";
+    if (m_emit_timing)
+    {
+        writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
+    }
 }

 void runtime::cpu::CPU_ExternalFunction::emit_debug_function_exit(
@@ -918,7 +904,10 @@ void runtime::cpu::CPU_ExternalFunction::emit_debug_function_exit(
    const std::vector<TensorViewWrapper>& in,
    const std::vector<TensorViewWrapper>& out)
 {
-    writer << "timer_" << node->get_name() << ".stop();\n";
+    if (m_emit_timing)
+    {
+        writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
+    }
 }

 bool runtime::cpu::CPU_ExternalFunction::is_functionally_identical(

--- a/src/ngraph/runtime/cpu/cpu_external_function.hpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <functional>
+#include <map>
 #include <memory>
 #include <string>
 #include <typeindex>
@@ -119,6 +120,12 @@ namespace ngraph
                bool m_emit_timing;
                bool m_use_tbb;
                std::unordered_map<std::string, std::string> m_variable_name_map;
+                std::map<std::string, size_t> m_name_index_map;
+
+                // Because we are directly accessing the constant data stored in the
+                // Constant ops we need to keep a list of shared_ptr to each Constant
+                // so they don't get freed before we are done with them
+                std::vector<std::shared_ptr<Node>> m_active_constants;

                LayoutDescriptorPtrs parameter_layout_descriptors;
                LayoutDescriptorPtrs result_layout_descriptors;

--- a/src/ngraph/runtime/cpu/cpu_runtime_context.hpp
+++ b/src/ngraph/runtime/cpu/cpu_runtime_context.hpp
@@ -37,6 +37,7 @@ namespace ngraph
            {
                int64_t* op_durations;
                mkldnn::primitive* const* mkldnn_primitives;
+                char* const* mkldnn_workspaces;
            };
            }
        }

--- a/src/ngraph/runtime/cpu/mkldnn_emitter.cpp
+++ b/src/ngraph/runtime/cpu/mkldnn_emitter.cpp
@@ -21,6 +21,7 @@

 #include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
 #include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
+#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
 #include "ngraph/runtime/cpu/mkldnn_utils.hpp"

 using namespace ngraph::runtime::cpu;
@@ -36,12 +37,24 @@ const std::vector<mkldnn::primitive*>& MKLDNNEmitter::get_mkldnn_primitives() co
    return m_mkldnn_primitives;
 }

+// Returns, by const reference, the list of raw workspace buffer pointers that
+// insert_workspace() has appended to m_workspace_bufs. No copy is made.
+const std::vector<char*>& MKLDNNEmitter::get_mkldnn_workspaces()
+{
+    return m_workspace_bufs;
+}
+
 size_t MKLDNNEmitter::insert_primitive(mkldnn::primitive* primitive)
 {
    m_mkldnn_primitives.emplace_back(primitive);
    return (m_mkldnn_primitives.size() - 1);
 }

+// Registers a workspace with the emitter.
+//
+// The raw buffer pointer is appended to m_workspace_bufs and ownership of the
+// workspace object is moved into m_workspaces, so `workspace` is empty when
+// this function returns. The return value is the index of the new entry in
+// m_workspaces.
+size_t MKLDNNEmitter::insert_workspace(std::unique_ptr<MKLDNNWorkspace>& workspace)
+{
+    // Index the new element will occupy once it has been appended.
+    const size_t slot = m_workspaces.size();
+
+    char* const raw_buffer = workspace->buf;
+    m_workspace_bufs.push_back(raw_buffer);
+    m_workspaces.push_back(std::move(workspace));
+
+    return slot;
+}
+
 const std::vector<size_t>& MKLDNNEmitter::get_primitive_deps(size_t index) const
 {
    return m_primitive_deps.at(index);
@@ -321,6 +334,105 @@ size_t MKLDNNEmitter::build_pooling_forward(mkldnn::algorithm pooling_algorithm,
    return primitive_index;
 }

+// Builds an MKLDNN pooling-backward primitive and registers it with the emitter.
+//
+// Memory primitives are created first for diff_dst (the input of the backward
+// op) and diff_src (its result). The backward primitive descriptor is then
+// built using a forward_training pooling descriptor over the same shapes and
+// window geometry as its hint.
+//
+// Returns the index of the new backward primitive; its dependency list is
+// recorded as {diff_dst memory index, diff_src memory index}.
+size_t MKLDNNEmitter::build_pooling_backward(mkldnn::algorithm pooling_algorithm,
+                                             const mkldnn::memory::desc& diff_dst_desc,
+                                             const mkldnn::memory::desc& diff_src_desc,
+                                             const ngraph::Strides& window_strides,
+                                             const ngraph::Shape& window_shape,
+                                             const ngraph::Shape& padding_below,
+                                             const ngraph::Shape& padding_above)
+{
+    const size_t diff_dst_index = build_memory_primitive(diff_dst_desc);
+    const size_t diff_src_index = build_memory_primitive(diff_src_desc);
+
+    // Window geometry converted to MKLDNN dims; used by both descriptors below.
+    const mkldnn::memory::dims strides(window_strides.begin(), window_strides.end());
+    const mkldnn::memory::dims kernel(window_shape.begin(), window_shape.end());
+    const mkldnn::memory::dims pad_below(padding_below.begin(), padding_below.end());
+    const mkldnn::memory::dims pad_above(padding_above.begin(), padding_above.end());
+
+    // Forward descriptor that serves only as the hint for the backward one.
+    mkldnn::pooling_forward::primitive_desc fwd_hint{{mkldnn::prop_kind::forward_training,
+                                                      pooling_algorithm,
+                                                      diff_src_desc,
+                                                      diff_dst_desc,
+                                                      strides,
+                                                      kernel,
+                                                      pad_below,
+                                                      pad_above,
+                                                      mkldnn::padding_kind::zero},
+                                                     mkldnn_utils::global_cpu_engine};
+
+    mkldnn::pooling_backward::primitive_desc bwd_pd{{pooling_algorithm,
+                                                     diff_src_desc,
+                                                     diff_dst_desc,
+                                                     strides,
+                                                     kernel,
+                                                     pad_below,
+                                                     pad_above,
+                                                     mkldnn::padding_kind::zero},
+                                                    mkldnn_utils::global_cpu_engine,
+                                                    fwd_hint};
+
+    const size_t primitive_index = insert_primitive(new mkldnn::pooling_backward(
+        bwd_pd, *m_mkldnn_primitives[diff_dst_index], *m_mkldnn_primitives[diff_src_index]));
+
+    m_primitive_deps[primitive_index] = {diff_dst_index, diff_src_index};
+    return primitive_index;
+}
+
+// Builds the primitives needed for max-pooling backprop and returns the index
+// of the backward primitive.
+//
+// Two primitives are registered, in this order: a forward_training pooling
+// primitive and then the pooling-backward primitive. Both are given the same
+// workspace memory primitive (ws_index): the forward primitive receives it as
+// its last argument and the backward primitive as its middle argument.
+//
+// Parameters:
+//   pooling_algorithm - MKLDNN pooling algorithm used for both primitives
+//   fprop_src_desc    - descriptor of the forward-pass input; used only to
+//                       create the memory primitive read by the forward op
+//   diff_dst_desc     - descriptor of the incoming delta
+//   diff_src_desc     - descriptor of the computed delta (the result)
+//   window_strides, window_shape, padding_below, padding_above
+//                     - pooling geometry, converted to mkldnn::memory::dims
+//
+// The dependency list recorded for each primitive ends with ws_buf_index,
+// which is the value returned by insert_workspace() rather than by
+// build_memory_primitive() like the other entries.
+size_t MKLDNNEmitter::build_max_pooling_backward(mkldnn::algorithm pooling_algorithm,
+                                                 const mkldnn::memory::desc& fprop_src_desc,
+                                                 const mkldnn::memory::desc& diff_dst_desc,
+                                                 const mkldnn::memory::desc& diff_src_desc,
+                                                 const ngraph::Strides& window_strides,
+                                                 const ngraph::Shape& window_shape,
+                                                 const ngraph::Shape& padding_below,
+                                                 const ngraph::Shape& padding_above)
+{
+    size_t fprop_src_index = build_memory_primitive(fprop_src_desc);
+    size_t diff_dst_index = build_memory_primitive(diff_dst_desc);
+    size_t diff_src_index = build_memory_primitive(diff_src_desc);
+
+    // Forward descriptor: note that it is described with diff_src_desc /
+    // diff_dst_desc as its source / destination, not with fprop_src_desc.
+    // It is also passed as the hint when constructing the backward primitive.
+    mkldnn::pooling_forward::primitive_desc fwd_pd{
+        {mkldnn::prop_kind::forward_training,
+         pooling_algorithm,
+         diff_src_desc,
+         diff_dst_desc,
+         mkldnn::memory::dims(window_strides.begin(), window_strides.end()),
+         mkldnn::memory::dims(window_shape.begin(), window_shape.end()),
+         mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
+         mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
+         mkldnn::padding_kind::zero},
+        mkldnn_utils::global_cpu_engine};
+
+    // Memory primitive describing the workspace layout reported by fwd_pd.
+    auto ws_index = build_memory_primitive(fwd_pd.workspace_primitive_desc().desc());
+    // Allocate workspace
+    // TODO (jbobba): Might need to align memory
+    // The buffer size is the one reported by the forward primitive descriptor.
+    auto ws = std::unique_ptr<MKLDNNWorkspace>(
+        new MKLDNNWorkspace(fwd_pd.workspace_primitive_desc().get_size()));
+    // insert_workspace() moves from `ws`; it must not be used after this line.
+    auto ws_buf_index = insert_workspace(ws);
+
+    size_t fwd_primitive_index = insert_primitive(new mkldnn::pooling_forward(
+        fwd_pd,
+        *m_mkldnn_primitives[fprop_src_index],
+        *m_mkldnn_primitives
+            [diff_src_index], // HACK - Uses diff_src buffer. Safe since diff_src > fprop_dst
+        *m_mkldnn_primitives[ws_index]));
+
+    size_t bwd_primitive_index = insert_primitive(new mkldnn::pooling_backward(
+        {{pooling_algorithm,
+          diff_src_desc,
+          diff_dst_desc,
+          mkldnn::memory::dims(window_strides.begin(), window_strides.end()),
+          mkldnn::memory::dims(window_shape.begin(), window_shape.end()),
+          mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
+          mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
+          mkldnn::padding_kind::zero},
+         mkldnn_utils::global_cpu_engine,
+         fwd_pd},
+        *m_mkldnn_primitives[diff_dst_index],
+        *m_mkldnn_primitives[ws_index],
+        *m_mkldnn_primitives[diff_src_index]));
+
+    // Each list is three memory-primitive indices followed by the workspace
+    // buffer index.
+    m_primitive_deps[fwd_primitive_index] = {
+        fprop_src_index, diff_src_index, ws_index, ws_buf_index};
+    m_primitive_deps[bwd_primitive_index] = {
+        diff_dst_index, ws_index, diff_src_index, ws_buf_index};
+    return bwd_primitive_index;
+}
+
 size_t MKLDNNEmitter::build_reorder(const mkldnn::memory::desc& input_desc,
                                    const mkldnn::memory::desc& result_desc)
 {

--- a/src/ngraph/runtime/cpu/mkldnn_emitter.hpp
+++ b/src/ngraph/runtime/cpu/mkldnn_emitter.hpp
@@ -35,6 +35,14 @@ namespace ngraph
            class CPU_ExternalFunction;
            class TensorViewWrapper;

+            // Owns a raw byte buffer used as scratch ("workspace") memory.
+            // The buffer is obtained from malloc in the constructor and handed
+            // back to free in the destructor.
+            class MKLDNNWorkspace
+            {
+            public:
+                // Allocates `size` bytes. `buf` holds whatever malloc returned;
+                // it is not checked for null here.
+                MKLDNNWorkspace(size_t size) { buf = static_cast<char*>(malloc(size)); }
+                ~MKLDNNWorkspace() { free(buf); }
+
+                // A copy would share `buf` with the original and both destructors
+                // would free it, so copying is disallowed.
+                MKLDNNWorkspace(const MKLDNNWorkspace&) = delete;
+                MKLDNNWorkspace& operator=(const MKLDNNWorkspace&) = delete;
+
+                char* buf;
+            };
+
            class MKLDNNEmitter
            {
            public:
@@ -42,8 +50,10 @@ namespace ngraph
                ~MKLDNNEmitter();

                const std::vector<mkldnn::primitive*>& get_mkldnn_primitives() const;
+                const std::vector<char*>& get_mkldnn_workspaces();

                size_t insert_primitive(mkldnn::primitive* primitive);
+                size_t insert_workspace(std::unique_ptr<MKLDNNWorkspace>& workspace);
                const std::vector<size_t>& get_primitive_deps(size_t index) const;

                // TODO(jmenon): Get rid of TensorViewWrappers at some point
@@ -109,6 +119,23 @@ namespace ngraph
                                             const ngraph::Shape& padding_below,
                                             const ngraph::Shape& padding_above);

+                size_t build_pooling_backward(mkldnn::algorithm pooling_algorithm,
+                                              const mkldnn::memory::desc& diff_dst_desc,
+                                              const mkldnn::memory::desc& diff_src_desc,
+                                              const ngraph::Strides& window_strides,
+                                              const ngraph::Shape& window_shape,
+                                              const ngraph::Shape& padding_below,
+                                              const ngraph::Shape& padding_above);
+
+                size_t build_max_pooling_backward(mkldnn::algorithm pooling_algorithm,
+                                                  const mkldnn::memory::desc& fprop_src_desc,
+                                                  const mkldnn::memory::desc& diff_dst_desc,
+                                                  const mkldnn::memory::desc& diff_src_desc,
+                                                  const ngraph::Strides& window_strides,
+                                                  const ngraph::Shape& window_shape,
+                                                  const ngraph::Shape& padding_below,
+                                                  const ngraph::Shape& padding_above);
+
                size_t build_reorder(const mkldnn::memory::desc& input_desc,
                                     const mkldnn::memory::desc& result_desc);

@@ -129,6 +156,8 @@ namespace ngraph
                std::vector<mkldnn::primitive*> m_mkldnn_primitives;
                std::vector<mkldnn::stream> m_mkldnn_streams;
                std::unordered_map<size_t, std::vector<size_t>> m_primitive_deps;
+                std::vector<std::unique_ptr<MKLDNNWorkspace>> m_workspaces;
+                std::vector<char*> m_workspace_bufs;
            };
        }
    }

--- a/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
@@ -29,6 +29,7 @@
 #include "ngraph/ops/avg_pool.hpp"
 #include "ngraph/ops/batch_norm.hpp"
 #include "ngraph/ops/convolution.hpp"
+#include "ngraph/ops/max_pool.hpp"
 #include "ngraph/ops/relu.hpp"
 #include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
 #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
@@ -245,10 +246,48 @@ namespace ngraph
                    }
                }

+                // Marks a MaxPool node as an MKLDNN op when its first input is
+                // rank 4 with element type f32 and the pooling window is 2-D.
+                // Nodes that do not match are left untouched by this function.
+                template <>
+                void CPUAssignment::ASSIGN_DECL(ngraph::op::MaxPool)
+                {
+                    auto max_pool = static_cast<op::MaxPool*>(node);
+
+                    // Only the rank is needed, so the shape is not copied.
+                    auto arg0_rank = node->get_input_shape(0).size();
+
+                    if (arg0_rank == 4 && max_pool->get_window_shape().size() == 2 &&
+                        node->get_input_element_type(0) == element::f32)
+                    {
+                        auto op_annotations =
+                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
+                        op_annotations->set_mkldnn_op(true);
+                        max_pool->set_op_annotations(op_annotations);
+                    }
+                }
+
+                // Marks a MaxPoolBackprop node as an MKLDNN op when its second
+                // input (index 1) is rank 4 with element type f32 and the pooling
+                // window is 2-D. Nodes that do not match are left untouched by
+                // this function.
+                template <>
+                void CPUAssignment::ASSIGN_DECL(ngraph::op::MaxPoolBackprop)
+                {
+                    auto max_pool_bprop = static_cast<op::MaxPoolBackprop*>(node);
+
+                    // Only the rank is needed, so the shape is not copied.
+                    auto arg1_rank = node->get_input_shape(1).size();
+
+                    if (arg1_rank == 4 && max_pool_bprop->get_window_shape().size() == 2 &&
+                        node->get_input_element_type(1) == element::f32)
+                    {
+                        auto op_annotations =
+                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
+                        op_annotations->set_mkldnn_op(true);
+                        max_pool_bprop->set_op_annotations(op_annotations);
+                    }
+                }
+
                template <>
                void CPUAssignment::ASSIGN_DECL(ngraph::op::Relu)
                {
-                    auto avg_pool = static_cast<op::Relu*>(node);
+                    auto relu = static_cast<op::Relu*>(node);

                    auto arg0_shape = node->get_input_shape(0);
                    auto arg0_rank = arg0_shape.size();
@@ -260,7 +299,7 @@ namespace ngraph
                        auto op_annotations =
                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
                        op_annotations->set_mkldnn_op(true);
-                        avg_pool->set_op_annotations(op_annotations);
+                        relu->set_op_annotations(op_annotations);
                    }
                }

@@ -280,18 +319,19 @@ namespace ngraph
                template <>
                void CPUAssignment::ASSIGN_DECL(ngraph::op::ReluBackprop)
                {
-                    auto avg_pool = static_cast<op::ReluBackprop*>(node);
+                    auto relu_bprop = static_cast<op::ReluBackprop*>(node);

                    auto arg0_shape = node->get_input_shape(0);
                    auto arg0_rank = arg0_shape.size();
                    auto result_shape = node->get_output_shape(0);

-                    if (arg0_rank == 4 && node->get_input_element_type(0) == element::f32)
+                    if ((arg0_rank == 4 || arg0_rank == 2) &&
+                        node->get_input_element_type(0) == element::f32)
                    {
                        auto op_annotations =
                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
                        op_annotations->set_mkldnn_op(true);
-                        avg_pool->set_op_annotations(op_annotations);
+                        relu_bprop->set_op_annotations(op_annotations);
                    }
                }

@@ -313,6 +353,9 @@ namespace ngraph

 static const runtime::cpu::pass::AssignOpMap s_dispatcher{
    {TI(ngraph::op::Add), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Add>},
+    {TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPool>},
+    {TI(ngraph::op::AvgPoolBackprop),
+     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPoolBackprop>},
    {TI(ngraph::op::BatchNorm), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::BatchNorm>},
    {TI(ngraph::op::Convolution),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>},
@@ -320,13 +363,13 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropData>},
    {TI(ngraph::op::ConvolutionBackpropFilters),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropFilters>},
+    {TI(ngraph::op::MaxPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::MaxPool>},
+    {TI(ngraph::op::MaxPoolBackprop),
+     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::MaxPoolBackprop>},
    {TI(ngraph::op::ConvolutionBias),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBias>},
    {TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBiasBackpropFiltersBias>},
-    {TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPool>},
-    {TI(ngraph::op::AvgPoolBackprop),
-     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPoolBackprop>},
    {TI(ngraph::op::Relu), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Relu>},
    {TI(ngraph::op::ReluBackprop),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReluBackprop>},

--- a/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
--- a/src/ngraph/runtime/cpu/pass/cpu_nop_elimination.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_nop_elimination.cpp
@@ -14,6 +14,7 @@
 * limitations under the License.
 *******************************************************************************/

+#include <functional>
 #include <memory>
 #include <typeindex>
 #include <typeinfo>

--- a/src/ngraph/runtime/external_function.hpp
+++ b/src/ngraph/runtime/external_function.hpp
@@ -19,6 +19,7 @@
 #include <memory>

 #include "ngraph/function.hpp"
+#include "ngraph/log.hpp"

 namespace ngraph
 {
@@ -34,6 +35,7 @@ namespace ngraph
                : m_function(function)
                , m_release_function(release_function)
                , m_is_compiled(false)
+                , m_timing(false)
            {
            }

@@ -42,12 +44,13 @@ namespace ngraph
        public:
            virtual ~ExternalFunction() {}
            virtual std::shared_ptr<CallFrame> make_call_frame() = 0;
-
+            void set_emit_timing(bool enable) { m_timing = enable; }
            const std::shared_ptr<ngraph::Function> get_function() { return m_function; }
        protected:
            std::shared_ptr<ngraph::Function> m_function;
            bool m_release_function;
            bool m_is_compiled;
+            bool m_timing;
        };
    }
 }
--- a/src/ngraph/runtime/manager.cpp
+++ b/src/ngraph/runtime/manager.cpp
@@ -15,6 +15,7 @@
 *******************************************************************************/

 #include <dlfcn.h>
+#include <functional>
 #include <iostream>
 #include <sstream>
 #include <string>

--- a/src/ngraph/runtime/manager.hpp
+++ b/src/ngraph/runtime/manager.hpp
@@ -16,6 +16,7 @@

 #pragma once

+#include <functional>
 #include <map>
 #include <memory>
 #include <string>

--- a/src/tools/nbench/nbench.cpp
+++ b/src/tools/nbench/nbench.cpp
@@ -21,34 +21,42 @@
 // sample models are under ../../test/models

 #include <fstream>
+#include <ngraph/file_util.hpp>
 #include <ngraph/runtime/backend.hpp>
 #include <ngraph/runtime/call_frame.hpp>
 #include <ngraph/runtime/manager.hpp>
+#include <ngraph/util.hpp>
+
 #include "util/benchmark.hpp"
 #include "util/test_tools.hpp"
+
 using namespace std;
+using namespace ngraph;

 int main(int argc, char** argv)
 {
-    string model = "model.json";
-    string backend = "INTERPRETER";
-    int iter = 10;
+    string model;
+    string backend = "CPU";
+    int iterations = 10;
    bool failed = false;
+    bool statistics = false;
+    bool timing_detail = false;
    for (size_t i = 1; i < argc; i++)
    {
-        if (string(argv[i]) == "-f")
+        string arg = argv[i];
+        if (arg == "-f" || arg == "--file")
        {
            model = argv[++i];
        }
-        else if (string(argv[i]) == "-b")
+        else if (arg == "-b" || arg == "--backend")
        {
            backend = argv[++i];
        }
-        else if (string(argv[i]) == "-i")
+        else if (arg == "-i" || arg == "--iterations")
        {
            try
            {
-                iter = stoi(argv[++i]);
+                iterations = stoi(argv[++i]);
            }
            catch (...)
            {
@@ -56,6 +64,19 @@ int main(int argc, char** argv)
                failed = true;
            }
        }
+        else if (arg == "-s" || arg == "--statistics")
+        {
+            statistics = true;
+        }
+        else if (arg == "--timing_detail")
+        {
+            timing_detail = true;
+        }
+        else
+        {
+            cout << "Unknown option: " << arg << endl;
+            failed = true;
+        }
    }
    if (!static_cast<bool>(ifstream(model)))
    {
@@ -73,12 +94,58 @@ SYNOPSIS
        nbench [-f <filename>] [-b <backend>] [-i <iterations>]

 OPTIONS
-        -f          model json file to use (default: model.json)
-        -b          Backend to use (default: INTERPRETER)
-        -i          Iterations (default: 10)
+        -f|--file          Serialized model file
+        -b|--backend       Backend to use (default: CPU)
+        -i|--iterations    Iterations (default: 10)
+        -s|--statistics    Display op statistics
+        --timing_detail    Gather detailed timing
 )###";
        return 1;
    }
-    cout << "Benchmarking " << model << ", " << backend << " backend, " << iter << " iterations.\n";
-    run_benchmark(model, backend, iter);
+
+    const string json_string = file_util::read_file_to_string(model);
+    stringstream ss(json_string);
+    shared_ptr<Function> f = deserialize(ss);
+    if (statistics)
+    {
+        cout << "statistics:" << endl;
+        cout << "total nodes: " << f->get_ops().size() << endl;
+        size_t total_constant_bytes = 0;
+        unordered_map<string, size_t> op_list;
+        for (shared_ptr<Node> node : f->get_ordered_ops())
+        {
+            string name = node->get_name();
+            string op_name = name.substr(0, name.find('_'));
+            string shape_name = "{" + join(node->get_outputs()[0].get_shape()) + "}";
+            op_list[op_name + shape_name]++;
+
+            if (op_name == "Constant")
+            {
+                const Shape& shape = node->get_outputs()[0].get_shape();
+                size_t const_size = node->get_outputs()[0].get_element_type().size();
+                if (shape.size() == 0)
+                {
+                    total_constant_bytes += const_size;
+                }
+                else
+                {
+                    total_constant_bytes +=
+                        (const_size * shape_size(node->get_outputs()[0].get_shape()));
+                }
+            }
+        }
+        cout << "Total Constant size: " << total_constant_bytes << " bytes\n";
+        for (const pair<string, size_t>& op_info : op_list)
+        {
+            cout << op_info.first << ": " << op_info.second << " ops" << endl;
+        }
+    }
+    else if (iterations > 0)
+    {
+        cout << "Benchmarking " << model << ", " << backend << " backend, " << iterations
+             << " iterations.\n";
+        run_benchmark(f, backend, iterations, timing_detail);
+    }
+
+    return 0;
 }
--- a/test/backend_test.in.cpp
+++ b/test/backend_test.in.cpp
@@ -5370,7 +5370,6 @@ TEST(${BACKEND_NAME}, numeric_double_inf)

 TEST(${BACKEND_NAME}, abc_tbb)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    ONLY_ENABLE_TEST_FOR("CPU", "${BACKEND_NAME}");

    // Force TBB flow graph generation in the CPU backend

--- a/test/cpu_fusion.cpp
+++ b/test/cpu_fusion.cpp
@@ -109,14 +109,10 @@ TEST(cpu_fusion, gemm_cpu_broadcast_row)
    auto A = make_shared<op::Parameter>(element::f32, shapeA);
    auto B = make_shared<op::Parameter>(element::f32, shapeB);

-    auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
-    auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
-
-    auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
+    auto bias = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{2.0f, 3.0f});

-    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0});
    auto cg = make_shared<op::MatmulBias>(
-        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0});
+        A, B, bias, A->get_shape(), B->get_shape(), true, true, AxisSet{0});

    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});

@@ -136,8 +132,8 @@ TEST(cpu_fusion, gemm_cpu_broadcast_row)
    copy_data(b, dataB);

    cf->call({a, b}, {result});
-    vector<float> expected{10, 28, 37, 109};
-    ASSERT_TRUE(read_vector<float>(result) == expected);
+    vector<float> expected{11, 30, 38, 111};
+    EXPECT_EQ(read_vector<float>(result), expected);
 }

 TEST(cpu_fusion, gemm_cpu_broadcast_column)
@@ -148,14 +144,10 @@ TEST(cpu_fusion, gemm_cpu_broadcast_column)
    auto A = make_shared<op::Parameter>(element::f32, shapeA);
    auto B = make_shared<op::Parameter>(element::f32, shapeB);

-    auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
-    auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
-
-    auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
+    auto bias = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{2.0f, 3.0f});

-    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{1});
    auto cg = make_shared<op::MatmulBias>(
-        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{1});
+        A, B, bias, A->get_shape(), B->get_shape(), true, true, AxisSet{1});

    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});

@@ -175,8 +167,8 @@ TEST(cpu_fusion, gemm_cpu_broadcast_column)
    copy_data(b, dataB);

    cf->call({a, b}, {result});
-    vector<float> expected{10, 28, 37, 109};
-    ASSERT_TRUE(read_vector<float>(result) == expected);
+    vector<float> expected{11, 29, 39, 111};
+    EXPECT_EQ(read_vector<float>(result), expected);
 }

 TEST(cpu_fusion, gemm_cpu_broadcast_matrix)

--- a/test/util/benchmark.cpp
+++ b/test/util/benchmark.cpp
@@ -17,56 +17,127 @@
 #include <iomanip>

 #include "benchmark.hpp"
+#include "ngraph/graph_util.hpp"
 #include "ngraph/runtime/backend.hpp"
 #include "ngraph/runtime/call_frame.hpp"
+#include "ngraph/runtime/external_function.hpp"
 #include "ngraph/runtime/manager.hpp"
 #include "ngraph/runtime/tensor_view.hpp"
 #include "ngraph/serializer.hpp"
 #include "ngraph/util.hpp"
 #include "random.hpp"

-std::multimap<size_t, std::string>
-    aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data)
+using namespace std;
+using namespace ngraph;
+
+/// Returns the node called \p name among the ops of every function that
+/// traverse_functions() visits starting from \p func, or nullptr when \p func
+/// is null or no node carries that name.
+///
+/// The name index is kept between calls and reused for as long as the same
+/// function object is passed; passing a different function rebuilds it.
+shared_ptr<Node> find_node(const string& name, shared_ptr<Function> func)
 {
-    std::unordered_map<std::string, size_t> timing;
-    for (const ngraph::runtime::PerformanceCounter& p : perf_data)
+    static unordered_map<string, shared_ptr<Node>> node_map;
+    // Remembers which function node_map was built from. An expired weak_ptr
+    // locks to null, so a new function at a recycled address still rebuilds.
+    static weak_ptr<Function> indexed_func;
+
+    if (!func)
+    {
+        return nullptr;
+    }
+
+    if (indexed_func.lock() != func)
     {
-        std::string op = p.name().substr(0, p.name().find('_'));
-        timing[op] += p.microseconds();
+        node_map.clear();
+        vector<shared_ptr<Function>> fs;
+        traverse_functions(func, [&](shared_ptr<Function> f) { fs.push_back(f); });
+        for (const shared_ptr<Function>& f : fs)
+        {
+            for (const shared_ptr<Node>& node : f->get_ops())
+            {
+                node_map.insert({node->get_name(), node});
+            }
+        }
+        indexed_func = func;
+    }
+
+    // find() rather than operator[] so an unknown name does not add a null entry.
+    const auto it = node_map.find(name);
+    if (it == node_map.end())
+    {
+        return nullptr;
+    }
+    return it->second;
+}
+
+/// Sums the microseconds of \p perf_data per "<op>{<shape>}" key and returns
+/// the totals as a multimap keyed by the summed time.
+///
+/// <op> is the counter name up to its first '_'; <shape> is the joined shape of
+/// output 0 of the node find_node() returns for that counter name in \p f.
+/// Counters for which no node is found are grouped under "<op>{unknown}".
+multimap<size_t, string>
+    aggregate_timing_details(const vector<runtime::PerformanceCounter>& perf_data,
+                             shared_ptr<Function> f)
+{
+    unordered_map<string, size_t> timing;
+    for (const runtime::PerformanceCounter& p : perf_data)
+    {
+        const string op = p.name().substr(0, p.name().find('_'));
+        const shared_ptr<Node> node = find_node(p.name(), f);
+        // find_node() hands back a null pointer for a name it does not know;
+        // never dereference it.
+        string shape_name = "{unknown}";
+        if (node)
+        {
+            shape_name = "{" + join(node->get_outputs()[0].get_shape()) + "}";
+        }
+        timing[op + shape_name] += p.microseconds();
     }

-    std::multimap<size_t, std::string> rc;
-    for (const std::pair<std::string, size_t>& t : timing)
+    multimap<size_t, string> rc;
+    // const auto& binds to the stored element; spelling pair<string, size_t>
+    // would copy each entry because the map's key type is const.
+    for (const auto& t : timing)
     {
         rc.insert({t.second, t.first});
     }
     return rc;
 }

-void run_benchmark(const std::string& json_path, const std::string& backend_name, size_t iterations)
+/// Sums the microseconds of \p perf_data per op type, where the op type is the
+/// counter name up to its first '_', and returns the totals as a multimap
+/// keyed by the summed time.
+multimap<size_t, string> aggregate_timing(const vector<runtime::PerformanceCounter>& perf_data)
 {
-    using namespace std;
-    using namespace ngraph;
-    string env_var_name = "NGRAPH_" + backend_name + "_EMIT_TIMING";
-    bool emit_timing = (std::getenv(env_var_name.c_str()) != nullptr);
-    if (!emit_timing)
+    unordered_map<string, size_t> timing;
+    for (const runtime::PerformanceCounter& p : perf_data)
+    {
+        const string op = p.name().substr(0, p.name().find('_'));
+        timing[op] += p.microseconds();
+    }
+
+    multimap<size_t, string> rc;
+    // const auto& binds to the stored element; spelling pair<string, size_t>
+    // would copy each entry because the map's key type is const.
+    for (const auto& t : timing)
     {
-        cout << "To get per-op timing set the environment variable " << env_var_name << "\n";
+        rc.insert({t.second, t.first});
     }
+    return rc;
+}

-    ngraph::test::Uniform<float> rng{-1, 1, 0};
+/// Loads the serialized function stored at \p json_path, prints how long the
+/// read plus deserialization took, then forwards the function together with
+/// the remaining arguments to the shared_ptr<Function> overload.
+void run_benchmark(const string& json_path,
+                   const string& backend_name,
+                   size_t iterations,
+                   bool timing_detail)
+{
+    // The stopwatch brackets both the file read and the deserialization.
+    stopwatch deserialize_timer;
+    deserialize_timer.start();
     const string json_string = file_util::read_file_to_string(json_path);
     stringstream ss(json_string);
     shared_ptr<Function> f = deserialize(ss);
+    deserialize_timer.stop();
+    cout << "deserialize time: " << deserialize_timer.get_milliseconds() << "ms" << endl;
+    run_benchmark(f, backend_name, iterations, timing_detail);
+}

-    stopwatch build_time;
-    build_time.start();
+/// Writes one "<name> <time>us" line per entry of \p timing to cout, largest
+/// time first. The name column is left-aligned and the time column
+/// right-aligned, each padded to its widest entry plus two.
+void print_times(const multimap<size_t, string>& timing)
+{
+    // Built once instead of per entry; used to measure each time as it is
+    // rendered under the user's locale.
+    const locale user_locale("");
+
+    // set the column widths
+    int name_width = 0;
+    int time_width = 0;
+    // const auto& avoids copying each entry (the multimap's key type is const).
+    for (const auto& entry : timing)
+    {
+        name_width = max(name_width, static_cast<int>(entry.second.size()));
+        stringstream ss;
+        ss.imbue(user_locale);
+        ss << entry.first;
+        time_width = max(time_width, static_cast<int>(ss.str().size()));
+    }
+    // Reverse iteration: the multimap is ordered by ascending time.
+    for (auto it = timing.rbegin(); it != timing.rend(); ++it)
+    {
+        cout << setw(name_width + 2) << left << it->second << " " << setw(time_width + 2) << right
+             << it->first << "us\n";
+    }
+}
+
+void run_benchmark(shared_ptr<Function> f,
+                   const string& backend_name,
+                   size_t iterations,
+                   bool timing_detail)
+{
+    test::Uniform<float> rng{-1, 1, 0};
+
+    stopwatch timer;
+    timer.start();
    auto manager = runtime::Manager::get(backend_name);
    auto external = manager->compile(f);
+    external->set_emit_timing(timing_detail);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);
-    build_time.stop();
-    cout << "build_time " << build_time.get_milliseconds() << "ms" << endl;
+    timer.stop();
+    cout.imbue(locale(""));
+    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;

    vector<shared_ptr<runtime::TensorView>> args;
    for (shared_ptr<op::Parameter> param : f->get_parameters())
@@ -100,9 +171,11 @@ void run_benchmark(const std::string& json_path, const std::string& backend_name
             return p1.total_microseconds() > p2.total_microseconds();
         });
    multimap<size_t, string> timing = aggregate_timing(perf_data);
-    for (auto it = timing.rbegin(); it != timing.rend(); it++)
-    {
-        cout.imbue(locale(""));
-        cout << setw(15) << left << it->second << " " << setw(10) << right << it->first << "us\n";
-    }
+    multimap<size_t, string> timing_details = aggregate_timing_details(perf_data, f);
+
+    cout << "\n---- Aggregate times per op type ----\n";
+    print_times(timing);
+
+    cout << "\n---- Aggregate times per op type/shape ----\n";
+    print_times(timing_details);
 }
--- a/test/util/benchmark.hpp
+++ b/test/util/benchmark.hpp
@@ -18,13 +18,21 @@

 #include <map>

+#include <ngraph/function.hpp>
 #include <ngraph/runtime/call_frame.hpp>
+
 #include "test_tools.hpp"

 /// performance test utilities
 std::multimap<size_t, std::string>
    aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);

+void run_benchmark(std::shared_ptr<ngraph::Function> f,
+                   const std::string& backend_name,
+                   size_t iterations,
+                   bool timing_detail);
+
 void run_benchmark(const std::string& json_path,
                   const std::string& backend_name,
-                   size_t iterations);
+                   size_t iterations,
+                   bool timing_detail = false);
--- a/test/util/test_tools.hpp
+++ b/test/util/test_tools.hpp
@@ -36,11 +36,8 @@
 #define ONLY_ENABLE_TEST_FOR(backend_to_enable, current_backend)                                   \
    if (backend_to_enable != current_backend)                                                      \
    {                                                                                              \
+        NGRAPH_INFO << "Skipped test for " << current_backend;                                     \
        return;                                                                                    \
-    }                                                                                              \
-    else                                                                                           \
-    {                                                                                              \
-        NGRAPH_INFO << "Enabled test for " << current_backend;                                     \
    }

 namespace ngraph