Commit b277627a authored by Amy Zhuang, committed by Scott Cyphers

Reuse memory for CPU backend. (#2238)

* Reuse memory for CPU backend.

* Use NGRAPH_REUSE_MEMORY to enable memory reuse.

* Add a test.

* Move make_function to test_tools.cpp.

* Add more comments.

* Address PR Feedback: add a method to CPU backend.

* Add a member to CPUOpAnnotations to remove redundant code.

* Overload compile function for CPU backend.

* Move make_function out of test_tools.

* Address PR Feedback.

* Use modified liveness analysis in CPUMemoryAssignment pass.

* Use lambda expression.

* Fix style error.

* Check if any user of the tensor has destructive oi when building tensor alias map.

* Fix a bug.

* Check if tensor has multiple users.

* Allow tensor alias for destructive oi node.

* Update multiple_users_tensor set along the chain of in place ops.

* No tensor alias if input is parameter or constant.

* Use buffer sets in CPU memory assignment; tensors sharing the same memory buffer are put into the same set.

* Add more checks and do not combine sets when allowing destructive oi.

* Style fix.

* Do not allow destructive oi if the input tensor uses function input memory.

Update set label.

* Add unit tests.

* Style fix.

* Get the correct size for memcpy when the input is padded.

* Style fix.

* Address PR feedback.

* Address PR feedback.

* Move make_function in cpu_test to after #if 0 and before the disabled test.

* Add utility functions.

Use iterator.

Rename variables.

* Add pass attributes and move cpu memory assignment to common passes (#2504)
parent c32512fa
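
For context, a condensed usage sketch distilled from the new cpu_test cases further down: memory reuse is opt-in, requested through a pass attribute on a PassConfig and the CPU-specific compile() overload added by this commit (the generic runtime::Backend interface is unchanged). The helper name compile_with_memory_reuse is illustrative only.

#include "ngraph/pass/pass_config.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"

std::shared_ptr<ngraph::runtime::Executable>
    compile_with_memory_reuse(const std::unique_ptr<ngraph::runtime::cpu::CPU_Backend>& cpu_backend,
                              const std::shared_ptr<ngraph::Function>& func)
{
    // "CPUMemoryAssignment::ReuseMemory" is the attribute key used by the new unit
    // test; attributes that are never set default to false.
    ngraph::pass::PassConfig pass_config;
    pass_config.set_pass_attribute("CPUMemoryAssignment::ReuseMemory", true);

    // CPU-specific compile() overload added in this commit.
    return cpu_backend->compile(func, pass_config);
}

// Obtaining a CPU_Backend pointer, as done in the disabled densenet test below:
//   auto backend = ngraph::runtime::Backend::create("CPU");
//   auto cpu_backend = std::unique_ptr<ngraph::runtime::cpu::CPU_Backend>(
//       static_cast<ngraph::runtime::cpu::CPU_Backend*>(backend.release()));
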
......@@ -21,6 +21,7 @@
using namespace ngraph;
// TODO: Add file-based configuration support
ngraph::pass::PassConfig::PassConfig()
{
/**
......@@ -42,9 +43,37 @@ ngraph::pass::PassConfig::PassConfig()
auto split_str = split(substr, ':', false);
switch (split_str.size())
{
case 1: m_enables.emplace(split_str[0], true); break;
case 2: m_enables.emplace(split_str[0], parse_string<bool>(split_str[1])); break;
default: throw ngraph_error("Unexpected string in get_pass_enables: " + substr);
case 1: m_pass_enables.emplace(split_str[0], true); break;
case 2: m_pass_enables.emplace(split_str[0], parse_string<bool>(split_str[1])); break;
default: throw ngraph_error("Unexpected string in NGRAPH_PASS_ENABLES: " + substr);
}
}
}
/**
* Parses the semicolon-separated environment string passed through NGRAPH_PASS_ATTRIBUTES
* and returns the pass attributes and whether they should be enabled or disabled in the
* provided unordered_map. Naming of pass attributes is up to the backends.
* E.g., NGRAPH_PASS_ATTRIBUTES="OptimizeForMemory=0;MemoryAssignment::ReuseMemory=1;UseDefaultLayouts"
* would set false on "OptimizeForMemory", true on "MemoryAssignment::ReuseMemory" and true on
* "UseDefaultLayouts"
**/
env_str = std::getenv("NGRAPH_PASS_ATTRIBUTES");
if (env_str)
{
std::stringstream ss;
ss << env_str;
while (ss.good())
{
std::string substr;
std::getline(ss, substr, ';');
auto split_str = split(substr, '=', false);
switch (split_str.size())
{
case 1: m_pass_attributes.emplace(split_str[0], true); break;
case 2:
m_pass_attributes.emplace(split_str[0], parse_string<bool>(split_str[1]));
break;
default: throw ngraph_error("Unexpected string in NGRAPH_PASS_ATTRIBUTES: " + substr);
}
}
}
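
To make the attribute semantics concrete, here is a small illustrative fragment (not part of the diff) querying the getters introduced further down in this file, assuming the example string from the comment above is present in the environment when the PassConfig is constructed:

// Assumed environment, same string as the comment above:
//   NGRAPH_PASS_ATTRIBUTES="OptimizeForMemory=0;MemoryAssignment::ReuseMemory=1;UseDefaultLayouts"
ngraph::pass::PassConfig pass_config;
bool reuse   = pass_config.get_pass_attribute("MemoryAssignment::ReuseMemory"); // true  ("=1")
bool opt_mem = pass_config.get_pass_attribute("OptimizeForMemory");             // false ("=0")
bool layouts = pass_config.get_pass_attribute("UseDefaultLayouts");             // true  (bare name)
bool other   = pass_config.get_pass_attribute("NotListed");                     // false (never set)
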
......@@ -52,14 +81,28 @@ ngraph::pass::PassConfig::PassConfig()
void ngraph::pass::PassConfig::set_pass_enable(std::string name, bool enable)
{
m_enables[name] = enable;
m_pass_enables[name] = enable;
}
bool ngraph::pass::PassConfig::get_pass_enable(std::string name)
{
if (m_enables.find(name) == m_enables.end())
if (m_pass_enables.find(name) == m_pass_enables.end())
{
return false;
}
return m_pass_enables[name];
}
void ngraph::pass::PassConfig::set_pass_attribute(std::string name, bool enable)
{
m_pass_attributes[name] = enable;
}
bool ngraph::pass::PassConfig::get_pass_attribute(std::string name)
{
if (m_pass_attributes.find(name) == m_pass_attributes.end())
{
return false;
}
return m_enables[name];
return m_pass_attributes[name];
}
......@@ -30,10 +30,14 @@ class ngraph::pass::PassConfig
{
public:
PassConfig();
const std::map<std::string, bool>& get_enables() { return m_enables; }
const std::map<std::string, bool>& get_enables() { return m_pass_enables; }
void set_pass_enable(std::string name, bool enable);
bool get_pass_enable(std::string name);
const std::map<std::string, bool>& get_pass_attributes() { return m_pass_attributes; }
void set_pass_attribute(std::string name, bool enable);
bool get_pass_attribute(std::string name);
private:
std::map<std::string, bool> m_enables;
std::map<std::string, bool> m_pass_enables;
std::map<std::string, bool> m_pass_attributes;
};
......@@ -39,13 +39,9 @@ bool ngraph::pass::PropagateCacheability::run_on_function(std::shared_ptr<Functi
op_annotations = op_annotations_factory();
op->set_op_annotations(op_annotations);
}
if (std::dynamic_pointer_cast<op::Constant>(node))
{
op_annotations->set_cacheable(true);
NGRAPH_DEBUG << "propagate cacheability: cacheability is 1";
}
else if (auto parameter = std::dynamic_pointer_cast<op::Parameter>(node))
if (node->is_parameter())
{
auto parameter = std::static_pointer_cast<op::Parameter>(node);
op_annotations->set_cacheable(parameter->get_cacheable());
NGRAPH_DEBUG << "propagate cacheability: cacheability is "
<< parameter->get_cacheable();
......
......@@ -109,6 +109,7 @@ set(SRC
pass/cpu_layout.cpp
pass/cpu_loop_kernel_fusion.cpp
pass/cpu_mat_fusion.cpp
pass/cpu_memory_assignment.cpp
pass/cpu_memory_optimization.cpp
pass/cpu_post_layout_optimizations.cpp
pass/cpu_rnn_fusion.cpp
......
......@@ -188,7 +188,7 @@ namespace ngraph
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg3_tensor = external_function->get_tensor_data(args[3].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg3_size = args[3].get_size();
size_t arg3_size = node->get_inputs()[3].get_tensor().size();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
......@@ -229,7 +229,7 @@ namespace ngraph
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg2_size = args[2].get_size();
size_t arg2_size = node->get_inputs()[2].get_tensor().size();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
......
......@@ -203,8 +203,8 @@ namespace ngraph
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg3_size = node->get_inputs()[3].get_tensor().size();
size_t arg3_size = args[3].get_size();
auto scales_size = shape_size(args[4].get_shape());
auto sum_scales_size = shape_size(args[5].get_shape());
......@@ -297,8 +297,8 @@ namespace ngraph
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg3_size = node->get_inputs()[3].get_tensor().size();
size_t arg3_size = args[3].get_size();
auto scales_size = shape_size(args[4].get_shape());
auto sum_scales_size = shape_size(args[5].get_shape());
......
......@@ -51,9 +51,10 @@ namespace
}
shared_ptr<runtime::cpu::CPU_CallFrame> runtime::cpu::CPU_Backend::make_call_frame(
const shared_ptr<runtime::cpu::CPU_ExternalFunction>& external_function)
const shared_ptr<runtime::cpu::CPU_ExternalFunction>& external_function,
ngraph::pass::PassConfig& pass_config)
{
return external_function->make_call_frame();
return external_function->make_call_frame(pass_config);
}
shared_ptr<runtime::Tensor>
......@@ -70,6 +71,15 @@ shared_ptr<runtime::Tensor> runtime::cpu::CPU_Backend::create_tensor(
shared_ptr<runtime::Executable>
runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func, bool performance_counters_enabled)
{
ngraph::pass::PassConfig pass_config;
return compile(func, pass_config, performance_counters_enabled);
}
shared_ptr<runtime::Executable>
runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func,
ngraph::pass::PassConfig& pass_config,
bool performance_counters_enabled)
{
shared_ptr<runtime::Executable> rc;
auto it = m_exec_map.find(func);
......@@ -79,13 +89,14 @@ shared_ptr<runtime::Executable>
}
else
{
rc = make_shared<CPU_Executable>(func, performance_counters_enabled);
rc = make_shared<CPU_Executable>(func, pass_config, performance_counters_enabled);
m_exec_map.insert({func, rc});
}
return rc;
}
runtime::cpu::CPU_Executable::CPU_Executable(shared_ptr<Function> func,
ngraph::pass::PassConfig& pass_config,
bool performance_counters_enabled)
{
FunctionInstance& instance = m_function_instance;
......@@ -93,7 +104,7 @@ runtime::cpu::CPU_Executable::CPU_Executable(shared_ptr<Function> func,
{
instance.m_external_function = make_shared<CPU_ExternalFunction>(func);
instance.m_external_function->m_emit_timing = performance_counters_enabled;
auto cf = instance.m_external_function->make_call_frame();
auto cf = instance.m_external_function->make_call_frame(pass_config);
instance.m_call_frame = dynamic_pointer_cast<CPU_CallFrame>(cf);
}
set_parameters_and_results(*func);
......
......@@ -20,6 +20,7 @@
#include <memory>
#include "cpu_backend_visibility.h"
#include "ngraph/pass/pass_config.hpp"
#include "ngraph/runtime/backend.hpp"
namespace ngraph
......@@ -35,7 +36,8 @@ namespace ngraph
{
public:
std::shared_ptr<CPU_CallFrame>
make_call_frame(const std::shared_ptr<CPU_ExternalFunction>& external_function);
make_call_frame(const std::shared_ptr<CPU_ExternalFunction>& external_function,
ngraph::pass::PassConfig& pass_config);
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
......@@ -50,6 +52,11 @@ namespace ngraph
compile(std::shared_ptr<Function> func,
bool enable_performance_counters = false) override;
std::shared_ptr<ngraph::runtime::Executable>
compile(std::shared_ptr<Function> func,
ngraph::pass::PassConfig& pass_config,
bool enable_performance_counters = false);
void remove_compiled_function(std::shared_ptr<Executable> exec) override;
bool is_supported(const Node& node) const override;
......@@ -63,7 +70,9 @@ namespace ngraph
class CPU_BACKEND_API CPU_Executable : public runtime::Executable
{
public:
CPU_Executable(std::shared_ptr<Function> func, bool performance_counters_enabled);
CPU_Executable(std::shared_ptr<Function> func,
ngraph::pass::PassConfig& pass_config,
bool performance_counters_enabled);
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
......
......@@ -129,8 +129,8 @@ bool runtime::cpu::CPU_Debugger::delete_breakpoint(std::shared_ptr<Node> op)
void* runtime::cpu::CPU_Debugger::inspect(std::shared_ptr<Node> op, size_t output_index)
{
return m_callframe.m_external_function->tensor_data.at(op->get_name() + "_" +
to_string(output_index));
return m_callframe.m_external_function->get_tensor_data(op->get_name() + "_" +
to_string(output_index));
}
bool runtime::cpu::CPU_Debugger::add_tracepoint(
......
......@@ -49,6 +49,7 @@
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
#include "ngraph/runtime/performance_counter.hpp"
#include "ngraph/state/state.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
......@@ -95,18 +96,11 @@ namespace ngraph
friend class CPU_Executable;
public:
enum class CPUTensorRole
{
INPUT,
CONSTANT,
OUTPUT,
INTERMEDIATE
};
CPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
bool release_function = true);
~CPU_ExternalFunction();
std::shared_ptr<ngraph::runtime::cpu::CPU_CallFrame> make_call_frame();
std::shared_ptr<ngraph::runtime::cpu::CPU_CallFrame>
make_call_frame(ngraph::pass::PassConfig& pass_config);
const LayoutDescriptorPtrs& get_parameter_layout_descriptors();
const LayoutDescriptorPtrs& get_result_layout_descriptors();
......@@ -171,11 +165,11 @@ namespace ngraph
#endif
protected:
void build();
void build(ngraph::pass::PassConfig& pass_config);
#if !defined(NGRAPH_DEX_ONLY)
void compile();
void compile(ngraph::pass::PassConfig& pass_config);
#endif
......@@ -183,37 +177,8 @@ namespace ngraph
private:
// Register passes that are common to codegen and DEX
void register_common_passes(ngraph::pass::Manager& pass_manager);
// For non-destructive passthrough kernels, propagate function
// constant buffers to internal ops
void propagate_in_place_constant(ngraph::descriptor::Output* output,
std::string input_name,
bool dex);
// For non-destructive passthrough kernels, propagate function
// input buffers to internal ops
void propagate_in_place_input(ngraph::descriptor::Output* output,
std::string input_name,
bool dex);
// For in-place kernels, propagate function output buffers to
// internal ops
void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
std::string output_name,
bool dex);
// Find in-place concat ops and set appropriate memory pool offset for its arguments
void process_in_place_concat(std::list<std::shared_ptr<Node>> nodes);
// For a chain of concat ops, propagate memory pool offsets
void propagate_in_place_concat(std::shared_ptr<ngraph::op::Concat> concat);
// Find in-place slice ops and set appropriate memory pool offset for its output
void process_in_place_slice(std::list<std::shared_ptr<Node>> nodes);
// propagate slice when its arg comes from function input
void propagate_in_place_slice(ngraph::descriptor::Output* output,
size_t input_index,
size_t input_offset);
void register_common_passes(ngraph::pass::Manager& pass_manager,
ngraph::pass::PassConfig& pass_config);
bool computes_result(Node* node);
void release_function() { m_function = nullptr; }
......@@ -238,6 +203,9 @@ namespace ngraph
std::string emit_op_as_function(const Node&, const std::string& function_name);
std::string strip_comments(const std::string&);
std::unordered_set<descriptor::Tensor*>&
get_tensor_set(descriptor::Tensor* output_tensor);
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
......@@ -269,8 +237,10 @@ namespace ngraph
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<std::string, std::pair<std::size_t, std::size_t>>
m_variable_input_index_offset_map;
std::unordered_map<std::string, std::pair<std::size_t, std::size_t>>
m_variable_output_index_offset_map;
std::unordered_map<std::string, CPUTensorRole> m_tensor_roles;
std::unordered_map<std::string, ngraph::CPUTensorRole> m_tensor_roles;
LayoutDescriptorPtrs parameter_layout_descriptors;
LayoutDescriptorPtrs result_layout_descriptors;
......@@ -290,15 +260,33 @@ namespace ngraph
executor;
std::unordered_map<std::string, void*> tensor_data;
std::unordered_map<std::string, bool> tensor_stale;
// Each tensor is put into one buffer set.
// All the tensors in the same buffer set share the same memory buffer.
// bufferID_to_tensorSets maps bufferID to the pair of CPUTensorRole and buffer set.
// CPUTensorRole is INPUT, CONSTANT, OUTPUT, or INTERMEDIATE,
// which tells from where the memory buffer comes.
std::unordered_map<
size_t,
std::pair<ngraph::CPUTensorRole, std::unordered_set<descriptor::Tensor*>>>
bufferID_to_tensorSets;
// tensor_to_bufferID maps tensor to the ID of the buffer set it belongs to.
std::unordered_map<descriptor::Tensor*, size_t> tensor_to_bufferID;
std::unordered_map<std::string, std::string> tensor_alias;
std::unordered_map<std::string, size_t> function_input_name_index;
// tensor pointer and its offset into the memory allocated for intermediates
// used to calculate the correct address at runtime
std::list<std::pair<std::reference_wrapper<void*>, size_t>> intermediates_offsets;
// tensor pointer, input index, offset into the input, and if the input is stale
// used to calculate the correct address at runtime
std::list<std::tuple<std::reference_wrapper<void*>,
size_t,
size_t,
std::reference_wrapper<bool>>>
function_input_index_offset;
// tensor pointer, output index, and offset into the output
// used to calculate the correct address at runtime
std::list<std::tuple<std::reference_wrapper<void*>, size_t, size_t>>
intermediate_input_index_offset;
std::list<
std::tuple<std::reference_wrapper<void*>, size_t, std::reference_wrapper<bool>>>
function_input_index;
std::list<std::pair<std::reference_wrapper<void*>, size_t>> function_output_index;
function_output_index_offset;
std::unordered_map<std::string, std::shared_ptr<CPU_ExternalFunction>> callees;
bool m_is_built;
std::vector<runtime::PerformanceCounter> m_perf_counters;
......
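
The two maps above are also what the new get_tensor_set() helper (declared earlier in this header) resolves through. Its real body lives in cpu_external_function.cpp, which is not shown in this view; conceptually it is just a two-step lookup, sketched here under that assumption:

// Conceptual sketch only -- not the actual implementation.
std::unordered_set<ngraph::descriptor::Tensor*>&
    CPU_ExternalFunction::get_tensor_set(ngraph::descriptor::Tensor* output_tensor)
{
    // Every tensor was assigned to exactly one buffer set during memory assignment.
    size_t buffer_id = tensor_to_bufferID.at(output_tensor);
    // The mapped value pairs the buffer's origin (CPUTensorRole) with the set of
    // tensors sharing that buffer; callers only need the set.
    return bufferID_to_tensorSets.at(buffer_id).second;
}
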
This diff is collapsed.
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <limits>
#include <list>
#include <sstream>
#include <unordered_map>
#include <unordered_set>
#include "ngraph/pass/pass.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{
class CPUMemoryAssignment;
}
}
}
}
class ngraph::runtime::cpu::pass::CPUMemoryAssignment : public ngraph::pass::FunctionPass
{
public:
CPUMemoryAssignment(
std::unordered_map<size_t,
std::pair<CPUTensorRole, std::unordered_set<descriptor::Tensor*>>>&,
std::unordered_map<descriptor::Tensor*, size_t>&,
size_t alignment = 1,
bool disable_memory_sharing = false);
bool run_on_function(std::shared_ptr<ngraph::Function>) override;
private:
// Find in-place concat ops and set appropriate memory pool offset for its arguments
void process_in_place_concat(std::list<std::shared_ptr<Node>> nodes);
// For a chain of concat ops, propagate memory pool offsets
void propagate_in_place_concat(std::shared_ptr<ngraph::op::Op> concat, size_t index);
// Find in-place slice ops and set appropriate memory pool offset for its output
void process_in_place_slice(std::list<std::shared_ptr<Node>> nodes);
// propagate slice when its arg comes from function input
void propagate_in_place_slice(ngraph::descriptor::Input* input, size_t input_index);
// build buffer sets maps
void build_buffer_sets_maps(std::list<std::shared_ptr<Node>>& ops);
// liveness analysis to build new and free list for each node
void liveness_analysis(std::list<std::shared_ptr<Node>>& ops);
size_t get_bufferID(descriptor::Tensor* tensor);
size_t m_alignment;
bool m_disable_memory_sharing;
std::set<descriptor::Tensor*> m_tensor_caching;
std::unordered_map<size_t,
std::pair<ngraph::CPUTensorRole, std::unordered_set<descriptor::Tensor*>>>&
m_bufferID_to_tensorSets;
std::unordered_map<descriptor::Tensor*, size_t>& m_tensor_to_bufferID;
};
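
In the backend, this pass is registered from CPU_ExternalFunction::register_common_passes(), whose implementation is not shown here. The sketch below covers only the registration mechanics under that assumption; the header path and helper name are illustrative, the real pipeline runs the pass together with the other common passes, and the attribute key matches the one used in the new unit test.

#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/pass_config.hpp"
#include "ngraph/runtime/cpu/pass/cpu_memory_assignment.hpp" // path assumed from the CMake entry above

void register_cpu_memory_assignment(
    ngraph::pass::Manager& pass_manager,
    ngraph::pass::PassConfig& pass_config,
    std::unordered_map<size_t,
                       std::pair<ngraph::CPUTensorRole,
                                 std::unordered_set<ngraph::descriptor::Tensor*>>>&
        bufferID_to_tensorSets,
    std::unordered_map<ngraph::descriptor::Tensor*, size_t>& tensor_to_bufferID,
    size_t alignment)
{
    // Memory sharing stays disabled unless reuse was requested via PassConfig or
    // NGRAPH_PASS_ATTRIBUTES.
    bool reuse = pass_config.get_pass_attribute("CPUMemoryAssignment::ReuseMemory");
    pass_manager.register_pass<ngraph::runtime::cpu::pass::CPUMemoryAssignment>(
        bufferID_to_tensorSets, tensor_to_bufferID, alignment, /*disable_memory_sharing=*/!reuse);
}
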
......@@ -62,8 +62,9 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
{
for (auto n : function->get_ordered_ops())
{
if (auto concat = std::dynamic_pointer_cast<op::Concat>(n))
if (n->description() == "Concat")
{
auto concat = std::static_pointer_cast<op::Concat>(n);
auto shape = concat->get_input_shape(0);
auto axis = concat->get_concatenation_axis();
auto product = 1;
......@@ -119,8 +120,7 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
const auto& output = input.get_output();
auto arg = output.get_node();
if (std::dynamic_pointer_cast<op::Constant>(arg) ||
std::dynamic_pointer_cast<op::Parameter>(arg))
if (arg->is_constant() || arg->is_parameter())
{
NGRAPH_DEBUG << "cpu_memory_optimization: " << arg->get_name()
<< ": constant or parameter, no in place concat";
......@@ -130,7 +130,7 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
NGRAPH_ASSERT(arg->get_output_size() == 1);
if (!std::dynamic_pointer_cast<op::Concat>(arg))
if (arg->description() != "Concat")
{
if (arg->is_op())
{
......@@ -154,7 +154,7 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
for (auto output_input : output.get_inputs())
{
auto user = output_input->get_node();
if (std::dynamic_pointer_cast<op::Concat>(user))
if (user->description() == "Concat")
{
concat_count++;
if (concat_count == 2)
......@@ -225,8 +225,9 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
for (auto n : function->get_ordered_ops())
{
if (auto slice = std::dynamic_pointer_cast<op::Slice>(n))
if (n->description() == "Slice")
{
auto slice = std::static_pointer_cast<op::Slice>(n);
auto in_shape = slice->get_input_shape(0);
auto out_shape = slice->get_output_shape(0);
auto strides = slice->get_strides();
......@@ -235,7 +236,6 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
auto upper_bounds = slice->get_upper_bounds();
auto arg = slice->get_argument(0);
if (arg->is_constant())
{
NGRAPH_DEBUG << "cpu_memory_optimization: " << arg->get_name()
......@@ -243,25 +243,6 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
continue;
}
bool no_in_place_slice = false;
if (arg->is_parameter())
{
for (auto user : slice->get_users())
{
if (user->is_output())
{
NGRAPH_DEBUG << "cpu_memory_optimization: slice between function input and "
"output, no in place slice";
no_in_place_slice = true;
break;
}
}
}
if (no_in_place_slice)
{
continue;
}
if (is_strided(strides))
{
NGRAPH_DEBUG << "cpu_memory_optimization: strided slice, no in place slice";
......
......@@ -219,6 +219,14 @@ namespace ngraph
* bprop function will have these nodes as the first N input parameters
**/
FpropCache cache_fprop(std::shared_ptr<Function> fprop, std::shared_ptr<Function> bprop);
enum class CPUTensorRole
{
INPUT,
CONSTANT,
OUTPUT,
INTERMEDIATE
};
} // end namespace ngraph
std::ostream& operator<<(std::ostream& os, const ngraph::NodeVector& nv);
......@@ -2861,7 +2861,7 @@ NGRAPH_TEST(${BACKEND_NAME}, computation_reuse)
Shape shape_a{1, 16, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{32, 16, 1, 1};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
auto B = make_shared<op::Parameter>(element::f32, shape_b, true);
Shape shape_r{1, 32, 2, 2};
auto conv = make_shared<op::Convolution>(A,
B,
......
......@@ -32,6 +32,7 @@
#include "ngraph/op/parameter.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
......@@ -669,6 +670,206 @@ TEST(cpu_test, convolution_large_padding)
compare_backends(int_f, cpu_f, "INTERPRETER", "CPU", 1e-4, 1e-4);
}
#if 0
static std::shared_ptr<Function> make_function(const std::string& file_name)
{
const string json_path = file_util::path_join(SERIALIZED_ZOO, file_name);
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> func = ngraph::deserialize(ss);
return func;
}
TEST(cpu_test, memory_reuse_mxnet_densenet121)
{
const std::string file_name("mxnet/mxnet_densenet121_inference_batch1_float32.json");
auto cpu_f = make_function(file_name);
test::Uniform<float> rng(-1.0f, 1.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
// without memory reuse
auto cpu_results = execute(cpu_f, args, "CPU");
auto cpu_f_new = make_function(file_name);
auto cpu_results_new = execute(cpu_f_new, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), cpu_results_new.at(i), 1.0e-4f, 1.0e-4f));
}
// with memory reuse
auto backend = runtime::Backend::create("CPU");
auto parms = cpu_f->get_parameters();
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> arg_tensors(args.size());
for (size_t i = 0; i < args.size(); i++)
{
auto t = backend->create_tensor(parms.at(i)->get_element_type(), parms.at(i)->get_shape());
copy_data(t, args.at(i));
arg_tensors.at(i) = t;
}
auto results = cpu_f->get_results();
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> result_tensors(results.size());
for (size_t i = 0; i < results.size(); i++)
{
result_tensors.at(i) =
backend->create_tensor(results.at(i)->get_element_type(), results.at(i)->get_shape());
}
ngraph::pass::PassConfig pass_config;
pass_config.set_pass_attribute("CPUMemoryAssignment::ReuseMemory", true);
auto cpu_backend = std::unique_ptr<runtime::cpu::CPU_Backend>(
static_cast<runtime::cpu::CPU_Backend*>(backend.release()));
auto cpu_f_new_reuse = make_function(file_name);
shared_ptr<runtime::Executable> handle = cpu_backend->compile(cpu_f_new_reuse, pass_config);
for (auto it = 0; it < 2; it++)
{
handle->call_with_validate(result_tensors, arg_tensors);
std::vector<std::vector<float>> cpu_results_new_reuse;
for (auto rt : result_tensors)
{
cpu_results_new_reuse.push_back(read_vector<float>(rt));
}
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(
test::all_close(cpu_results.at(i), cpu_results_new_reuse.at(i), 1.0e-4f, 1.0e-4f));
}
}
}
#endif
TEST(cpu_test, memory_reuse_destructive_oi_relu)
{
auto shape_a = Shape{2, 5};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto B = make_shared<op::Parameter>(element::f32, shape_a);
auto C = make_shared<op::Parameter>(element::f32, shape_a);
auto add = make_shared<op::Add>(A, B);
auto relu = make_shared<op::Relu>(add);
auto subtract = make_shared<op::Subtract>(C, relu);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(subtract, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("CPU");
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto b = backend->create_tensor(element::f32, shape_a);
copy_data(b, vector<float>{1, 2, 3, 4, 0.5, 1, 8, -8, 17, -0.5});
auto c = backend->create_tensor(element::f32, shape_a);
copy_data(c, vector<float>{2, 10, 0, 21, 0, 2, 16, 0, 34, 0});
auto result = backend->create_tensor(element::f32, shape_rt);
vector<float> expected{0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
shared_ptr<runtime::Executable> handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(cpu_test, memory_reuse_cacheable_no_destructive_oi_relu)
{
auto shape_a = Shape{2, 5};
auto A = make_shared<op::Parameter>(element::f32, shape_a, true);
auto B = make_shared<op::Parameter>(element::f32, shape_a, true);
auto C = make_shared<op::Parameter>(element::f32, shape_a);
auto add = make_shared<op::Add>(A, B);
auto relu = make_shared<op::Relu>(add);
auto subtract = make_shared<op::Subtract>(C, relu);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(subtract, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("CPU");
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto b = backend->create_tensor(element::f32, shape_a);
copy_data(b, vector<float>{1, 2, 3, 4, 0.5, 1, 8, -8, 17, -0.5});
auto c = backend->create_tensor(element::f32, shape_a);
copy_data(c, vector<float>{2, 10, 0, 21, 0, 2, 16, 0, 34, 0});
auto result = backend->create_tensor(element::f32, shape_rt);
vector<float> expected{0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
shared_ptr<runtime::Executable> handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
EXPECT_EQ(read_vector<float>(result), expected);
a->set_stale(false);
b->set_stale(false);
handle->call_with_validate({result}, {a, b, c});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(cpu_test, memory_reuse_in_place_concat_after_in_place_slice)
{
Shape shape_a{4, 4};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto B = make_shared<op::Slice>(A, Coordinate{0, 0}, Coordinate{2, 4});
auto D = make_shared<op::Slice>(B, Coordinate{1, 0}, Coordinate{2, 4});
auto E = make_shared<op::Slice>(A, Coordinate{2, 0}, Coordinate{3, 4});
auto r = make_shared<op::Concat>(NodeVector{B, D, E}, 0);
auto f = make_shared<Function>(r, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
auto result = backend->create_tensor(element::f32, shape_a);
shared_ptr<runtime::Executable> handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_EQ((vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12}),
read_vector<float>(result));
}
TEST(cpu_test, memory_reuse_in_place_slice_after_in_place_concat)
{
Shape shape{1, 1};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto C = make_shared<op::Parameter>(element::f32, shape);
auto D = make_shared<op::Parameter>(element::f32, shape);
auto add2 = make_shared<op::Add>(C, D);
auto subtract = make_shared<op::Subtract>(C, A);
auto concat = make_shared<op::Concat>(NodeVector{add1, add2, subtract}, 0);
Shape shape_r{2, 1};
auto slice = make_shared<op::Slice>(concat, Coordinate{0, 0}, Coordinate{2, 1});
auto f = make_shared<Function>(slice, ParameterVector{A, B, C, D});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{2});
auto c = backend->create_tensor(element::f32, shape);
copy_data(c, vector<float>{3});
auto d = backend->create_tensor(element::f32, shape);
copy_data(d, vector<float>{4});
auto result = backend->create_tensor(element::f32, shape_r);
shared_ptr<runtime::Executable> handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c, d});
EXPECT_EQ((vector<float>{3, 7}), read_vector<float>(result));
}
TEST(cpu_test, convert_inplace)
{
Shape shape{2, 2};
......