Commit 41c50b44 authored by Jayaram Bobba, committed by Scott Cyphers

Computation reuse (#945)

* Make temp memory pools static to avoid memory allocation overheads

* Initial implementation for graph control to enable caching and computation reuse

* Added sphinx documentation

* Turned off memory buffer reuse in CPU transformer to support computation reuse. Added unit test

* Change memoizable to cacheable

* Change memoizable to cacheable

* Rename variables
parent 698f29f1
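
The pieces introduced here fit together roughly as follows: a Parameter constructed with cacheable = true declares that its data is rarely updated, the caller marks the corresponding tensor view non-stale before subsequent calls, and the CPU backend then skips recomputing any op whose inputs are all non-stale. Below is a minimal usage sketch assembled from the interfaces in this diff; the shapes, values, includes, and the choice of the "CPU" backend are illustrative, not part of the commit.

// Hypothetical usage sketch of the cacheable/stale interfaces added by this commit.
// Includes and setup are approximate; only Parameter(..., cacheable) and set_stale()
// come directly from the diff below.
#include <memory>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "ngraph/runtime/backend.hpp"

using namespace ngraph;

int main()
{
    Shape shape{2, 2};
    auto X = std::make_shared<op::Parameter>(element::f32, shape);
    // W is not expected to change between calls, so mark it cacheable.
    auto W = std::make_shared<op::Parameter>(element::f32, shape, /*cacheable=*/true);
    auto f = std::make_shared<Function>(std::make_shared<op::Add>(X, W),
                                        op::ParameterVector{X, W});

    auto backend = runtime::Backend::create("CPU");
    auto x = backend->create_tensor(element::f32, shape);
    auto w = backend->create_tensor(element::f32, shape);
    auto r = backend->create_tensor(element::f32, shape);

    std::vector<float> ones(shape_size(shape), 1.0f);
    x->write(ones.data(), 0, ones.size() * sizeof(float));
    w->write(ones.data(), 0, ones.size() * sizeof(float));

    backend->call(f, {r}, {x, w}); // first call: everything is computed
    w->set_stale(false);           // w's data has not changed since the last call
    backend->call(f, {r}, {x, w}); // ops fed only by non-stale inputs reuse cached results
    return 0;
}
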
......@@ -25,6 +25,9 @@ Attributes
+------------------+------------------------------------------+
| ``shape`` | The ``Shape`` of the parameter. |
+------------------+------------------------------------------+
| ``cacheable`` | True if the parameter is not expected to |
| | be frequently updated. |
+------------------+------------------------------------------+
Outputs
-------
......
......@@ -21,8 +21,11 @@
using namespace std;
using namespace ngraph;
-op::Parameter::Parameter(const element::Type& element_type, const Shape& shape)
+op::Parameter::Parameter(const element::Type& element_type,
+                         const Shape& shape,
+                         const bool cacheable)
: Op("Parameter", {})
, m_cacheable(cacheable)
{
add_output(element_type, shape);
}
......
......@@ -40,10 +40,17 @@ namespace ngraph
///
/// \param element_type The element type of the parameter.
/// \param shape The shape of the parameter.
-Parameter(const ngraph::element::Type& element_type, const Shape& shape);
+/// \param cacheable True if the parameter is not expected to be frequently updated.
+Parameter(const ngraph::element::Type& element_type,
+          const Shape& shape,
+          const bool cacheable = false);
const bool get_cacheable() const { return m_cacheable; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
protected:
bool m_cacheable;
};
}
}
......@@ -27,8 +27,9 @@
using namespace std;
using namespace ngraph;
-pass::MemoryLayout::MemoryLayout(size_t alignment)
+pass::MemoryLayout::MemoryLayout(size_t alignment, bool disable_memory_sharing)
: m_alignment(alignment)
, m_disable_memory_sharing(disable_memory_sharing)
{
}
......@@ -42,9 +43,12 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
size_t offset = mm.allocate(tensor->size());
tensor->set_pool_offset(offset);
}
-for (const descriptor::Tensor* tensor : node->liveness_free_list)
+if (!m_disable_memory_sharing)
{
-    mm.free(tensor->get_pool_offset());
+    for (const descriptor::Tensor* tensor : node->liveness_free_list)
+    {
+        mm.free(tensor->get_pool_offset());
+    }
}
}
function->set_temporary_pool_size(mm.max_allocated());
......
......@@ -35,11 +35,12 @@ namespace ngraph
class ngraph::pass::MemoryLayout : public FunctionPass
{
public:
-MemoryLayout(size_t alignment = 1);
+MemoryLayout(size_t alignment = 1, bool disable_memory_sharing = false);
bool run_on_function(std::shared_ptr<ngraph::Function>) override;
private:
size_t m_alignment;
bool m_disable_memory_sharing;
};
class ngraph::pass::MemoryManager
......
......@@ -51,6 +51,7 @@ void runtime::cpu::CPU_CallFrame::call(
{
shared_ptr<runtime::cpu::CPUTensorView> tv =
static_pointer_cast<runtime::cpu::CPUTensorView>(input_tvs[i]);
ctx->p_en[i] = tv->get_stale();
inputs.push_back(tv->get_data_ptr());
}
for (size_t i = 0; i < output_tvs.size(); i++)
......@@ -100,6 +101,7 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
{
ctx->op_durations = new int64_t[m_external_function->get_op_attrs().size()];
}
ctx->p_en = new bool[m_external_function->get_parameter_layout_descriptors().size()];
const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter();
ctx->mkldnn_primitives = mkldnn_emitter->get_mkldnn_primitives().data();
ctx->mkldnn_workspaces = mkldnn_emitter->get_mkldnn_workspaces().data();
......@@ -108,5 +110,6 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
void runtime::cpu::CPU_CallFrame::cleanup_runtime_context()
{
delete[] ctx->op_durations;
delete[] ctx->p_en;
delete ctx;
}
......@@ -330,7 +330,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>();
-pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
+pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment, true);
pass_manager.run_passes(m_function);
unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
......@@ -588,6 +588,26 @@ using namespace ngraph::runtime;
<< ");\n";
}
// Indexing for Control Flags
std::map<std::string, size_t> tensor_index_map;
std::map<std::string, size_t> param_index_map;
size_t tensor_index = 0;
for (shared_ptr<Node> node : ordered_ops)
{
if (!node->is_parameter() && !node->is_constant())
{
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
tensor_index_map.insert({tv->get_tensor().get_name(), tensor_index++});
}
}
}
writer << "bool " << current_function->get_name() << "_t_en[" << tensor_index << "];\n";
writer << "bool " << current_function->get_name() << "_init = true;\n";
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, cpu::CPURuntimeContext* ctx)\n";
writer << "{\n";
......@@ -625,6 +645,8 @@ using namespace ngraph::runtime;
}
}
writer << "bool* t_en = (bool*)" << current_function->get_name() << "_t_en;\n";
// Add inputs to the variable name map
size_t arg_index = 0;
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
......@@ -637,6 +659,7 @@ using namespace ngraph::runtime;
stringstream ss;
ss << "((" << type << "*)(inputs[" << arg_index << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
param_index_map[tv->get_tensor().get_name()] = arg_index;
arg_index++;
}
}
......@@ -748,6 +771,30 @@ using namespace ngraph::runtime;
{
emit_debug_function_entry(writer, node.get(), in, out);
}
// Op Control
if (!node->is_parameter() && !node->is_constant())
{
writer << "if (" << current_function->get_name() << "_init ";
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
auto input_name = tv->get_tensor().get_name();
if (output.get_node()->is_parameter())
{
writer << " || ctx->p_en[" << param_index_map[input_name] << "]";
}
else if (!output.get_node()->is_constant())
{
writer << " || t_en[" << tensor_index_map[input_name] << "]";
}
}
writer << ") {\n";
writer.indent++;
}
string func_name;
auto it = match_functions.find(node.get());
if (it == match_functions.end())
......@@ -794,6 +841,19 @@ using namespace ngraph::runtime;
// Emit operation epilogue
if (!node->is_parameter() && !node->is_constant())
{
for (auto output_name : node_output_names)
{
writer << "t_en[" << tensor_index_map[output_name] << "] = true;\n";
}
writer.indent--;
writer << "} else {\n";
writer.indent++;
for (auto output_name : node_output_names)
{
writer << "t_en[" << tensor_index_map[output_name] << "] = false;\n";
}
writer.indent--;
writer << "}\n";
emit_debug_function_exit(writer, node.get(), in, out);
if (runtime::cpu::IsTracingEnabled() &&
current_function->get_name() == m_function_name)
......@@ -850,6 +910,7 @@ using namespace ngraph::runtime;
writer << "try { G.wait_for_all(); } catch(...) { throw; }\n";
}
}
writer << current_function->get_name() << "_init = false;\n";
writer.indent--;
// End generated function
......
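
For orientation, here is a hand-written approximation of the control-flow skeleton the emitter above generates around a single non-parameter op. The function name, tensor-enable indices, and kernel call are made up; the flag names (X_init, X_t_en, ctx->p_en, t_en) mirror the strings emitted in the diff.

// Hypothetical sketch of the generated guard for one op (not actual emitter output).
namespace cpu { struct CPURuntimeContext { bool* p_en; }; } // stand-in for the real runtime struct

bool func_0_t_en[4];     // per-tensor "enabled" (freshly computed) flags
bool func_0_init = true; // force full computation on the first call

extern "C" void func_0(void** inputs, void** outputs, cpu::CPURuntimeContext* ctx)
{
    bool* t_en = (bool*)func_0_t_en;
    // ... cast inputs/outputs to typed pointers ...
    // Recompute this op only on the first call or when some input changed:
    // parameter inputs are checked via ctx->p_en, intermediate tensors via t_en.
    if (func_0_init || ctx->p_en[0] || t_en[2])
    {
        // ... kernel / function call for the op goes here ...
        t_en[3] = true;  // the op's output is fresh; dependents must recompute
    }
    else
    {
        t_en[3] = false; // output unchanged; dependents may reuse cached results
    }
    func_0_init = false; // later calls rely solely on the staleness flags
}
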
......@@ -38,6 +38,7 @@ namespace ngraph
struct CPURuntimeContext
{
int64_t* op_durations;
bool* p_en;
mkldnn::primitive* const* mkldnn_primitives;
char* const* mkldnn_workspaces;
};
......
......@@ -38,6 +38,7 @@ namespace ngraph
protected:
TensorView(const std::shared_ptr<ngraph::descriptor::TensorView>& descriptor)
: m_descriptor(descriptor)
, m_stale(true)
{
}
......@@ -58,6 +59,8 @@ namespace ngraph
std::shared_ptr<ngraph::descriptor::layout::TensorViewLayout>
get_tensor_view_layout() const;
bool get_stale() { return m_stale; }
void set_stale(bool val) { m_stale = val; }
/// @brief Write bytes directly into the tensor
/// @param p Pointer to source of data
/// @param tensor_offset Offset into tensor storage to begin writing. Must be element-aligned.
......@@ -72,6 +75,7 @@ namespace ngraph
protected:
std::shared_ptr<ngraph::descriptor::TensorView> m_descriptor;
bool m_stale;
};
using TensorViewPtrs = std::vector<std::shared_ptr<TensorView>>;
......
......@@ -739,7 +739,8 @@ static shared_ptr<ngraph::Function>
node_js.count("element_type") == 0 ? node_js.at("value_type") : node_js;
auto element_type = read_element_type(type_node_js.at("element_type"));
auto shape = type_node_js.at("shape");
-node = make_shared<op::Parameter>(element_type, shape);
+auto cacheable = get_or_default<bool>(node_js, "cacheable", false);
+node = make_shared<op::Parameter>(element_type, shape, cacheable);
}
else if (node_op == "Power")
{
......
......@@ -5837,6 +5837,48 @@ TEST(${BACKEND_NAME}, mkldnn_layouts)
EXPECT_EQ(vector<float>{expected_result}, rv);
}
TEST(${BACKEND_NAME}, computation_reuse)
{
ONLY_ENABLE_TEST_FOR("CPU", "${BACKEND_NAME}");
Shape shape_a{1, 16, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{32, 16, 1, 1};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{1, 32, 2, 2};
auto conv = make_shared<op::Convolution>(A,
B,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
Shape pool_shape{1, 1};
auto pool = make_shared<op::AvgPool>(conv, pool_shape);
auto bias = make_shared<op::Broadcast>(
op::Constant::create(element::f32, Shape{}, {2.14}), shape_r, AxisSet{0, 1, 2, 3});
auto result_op = make_shared<op::Result>(pool + bias);
auto f = make_shared<Function>(ResultVector{result_op}, op::ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
vector<float> input(64, 1.0f);
vector<float> weights(512, 0.5f);
vector<float> rv(128);
auto a = backend->create_tensor(element::f32, shape_a, input.data());
auto b = backend->create_tensor(element::f32, shape_b, weights.data());
auto result = backend->create_tensor(element::f32, shape_r, rv.data());
backend->call(f, {result}, {a, b});
vector<float> rv_saved(rv);
b->set_stale(false);
backend->call(f, {result}, {a, b});
EXPECT_EQ(rv_saved, rv);
}
TEST(${BACKEND_NAME}, avg_pool_1d_1channel_1image)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
......
......@@ -205,11 +205,13 @@ void run_benchmark(shared_ptr<Function> f,
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::TensorView>> args;
vector<bool> args_cacheable;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape());
random_init(tensor);
args.push_back(tensor);
args_cacheable.push_back(param->get_cacheable());
}
vector<shared_ptr<runtime::TensorView>> results;
for (shared_ptr<Node> out : f->get_results())
......@@ -218,6 +220,13 @@ void run_benchmark(shared_ptr<Function> f,
results.push_back(result);
}
for (size_t i = 0; i < args.size(); i++)
{
if (args_cacheable[i])
{
args[i]->set_stale(false);
}
}
stopwatch t1;
t1.start();
for (size_t i = 0; i < static_cast<size_t>(iterations); i++)
......