Commit b277627a authored by Amy Zhuang, committed by Scott Cyphers

Reuse memory for CPU backend. (#2238)

* Reuse memory for CPU backend.

* Use NGRAPH_REUSE_MEMORY to enable memory reuse.

* Add a test.

* Move make_function to test_tools.cpp.

* Add more comments.

* Address PR Feedback: add a method to CPU backend.

* Add a member to CPUOpAnnotations to remove redundant code.

* Overload compile function for CPU backend.

* Move make_function out of test_tools.

* Address PR Feedback.

* Use modified liveness analysis in CPUMemoryAssignment pass.

* Use lambda expression.

* Fix style error.

* Check if any user of the tensor has destructive oi when building tensor alias map.

* Fix a bug.

* Check if tensor has multiple users.

* Allow tensor alias for destructive oi node.

* Update multiple_users_tensor set along the chain of in place ops.

* No tensor alias if input is parameter or constant.

* Use buffer sets in CPU memory assignment; tensors sharing the same memory buffer are put into the same set.

* Add more checks and do not combine sets when allowing destructive oi.

* Style fix.

* Do not allow destructive oi if the input tensor uses function input memory.

Update set label.

* Add unit tests.

* Style fix.

* Get the correct size for memcpy when the input is padded.

* Style fix.

* Address PR feedback.

* Address PR feedback.

* Move make_function in cpu_test to after #if 0 and before the disabled test.

* Add utility functions.

Use iterator.

Rename variables.

* Add pass attributes and move cpu memory assignment to common passes (#2504)
parent c32512fa
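
For context, a condensed usage sketch distilled from the new cpu_test cases further down: memory reuse is opt-in, requested through a pass attribute on a PassConfig and the CPU-specific compile() overload added by this commit (the generic runtime::Backend interface is unchanged). The helper name compile_with_memory_reuse is illustrative only.

#include "ngraph/pass/pass_config.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"

std::shared_ptr<ngraph::runtime::Executable>
    compile_with_memory_reuse(const std::unique_ptr<ngraph::runtime::cpu::CPU_Backend>& cpu_backend,
                              const std::shared_ptr<ngraph::Function>& func)
{
    // "CPUMemoryAssignment::ReuseMemory" is the attribute key used by the new unit
    // test; attributes that are never set default to false.
    ngraph::pass::PassConfig pass_config;
    pass_config.set_pass_attribute("CPUMemoryAssignment::ReuseMemory", true);

    // CPU-specific compile() overload added in this commit.
    return cpu_backend->compile(func, pass_config);
}

// Obtaining a CPU_Backend pointer, as done in the disabled densenet test below:
//   auto backend = ngraph::runtime::Backend::create("CPU");
//   auto cpu_backend = std::unique_ptr<ngraph::runtime::cpu::CPU_Backend>(
//       static_cast<ngraph::runtime::cpu::CPU_Backend*>(backend.release()));
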
......@@ -21,6 +21,7 @@
using namespace ngraph;
// TODO: Add file-based configuration support
ngraph::pass::PassConfig::PassConfig()
{
/**
......@@ -42,9 +43,37 @@ ngraph::pass::PassConfig::PassConfig()
auto split_str = split(substr, ':', false);
switch (split_str.size())
{
case 1: m_enables.emplace(split_str[0], true); break;
case 2: m_enables.emplace(split_str[0], parse_string<bool>(split_str[1])); break;
default: throw ngraph_error("Unexpected string in get_pass_enables: " + substr);
case 1: m_pass_enables.emplace(split_str[0], true); break;
case 2: m_pass_enables.emplace(split_str[0], parse_string<bool>(split_str[1])); break;
default: throw ngraph_error("Unexpected string in NGRAPH_PASS_ENABLES: " + substr);
}
}
}
/**
* Parses the semicolon-separated environment string passed through NGRAPH_PASS_ATTRIBUTES
* and returns the pass attributes and whether they should be enabled or disabled in the
* provided unordered_map. Naming of pass attributes is up to the backends.
* E.g., NGRAPH_PASS_ATTRIBUTES="OptimizeForMemory=0;MemoryAssignment::ReuseMemory=1;UseDefaultLayouts"
* would set false on "OptimizeForMemory", true on "MemoryAssignment::ReuseMemory" and true on
* "UseDefaultLayouts"
**/
env_str = std::getenv("NGRAPH_PASS_ATTRIBUTES");
if (env_str)
{
std::stringstream ss;
ss << env_str;
while (ss.good())
{
std::string substr;
std::getline(ss, substr, ';');
auto split_str = split(substr, '=', false);
switch (split_str.size())
{
case 1: m_pass_attributes.emplace(split_str[0], true); break;
case 2:
m_pass_attributes.emplace(split_str[0], parse_string<bool>(split_str[1]));
break;
default: throw ngraph_error("Unexpected string in NGRAPH_PASS_ATTRIBUTES: " + substr);
}
}
}
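
To make the attribute semantics concrete, here is a small illustrative fragment (not part of the diff) querying the getters introduced further down in this file, assuming the example string from the comment above is present in the environment when the PassConfig is constructed:

// Assumed environment, same string as the comment above:
//   NGRAPH_PASS_ATTRIBUTES="OptimizeForMemory=0;MemoryAssignment::ReuseMemory=1;UseDefaultLayouts"
ngraph::pass::PassConfig pass_config;
bool reuse   = pass_config.get_pass_attribute("MemoryAssignment::ReuseMemory"); // true  ("=1")
bool opt_mem = pass_config.get_pass_attribute("OptimizeForMemory");             // false ("=0")
bool layouts = pass_config.get_pass_attribute("UseDefaultLayouts");             // true  (bare name)
bool other   = pass_config.get_pass_attribute("NotListed");                     // false (never set)
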
......@@ -52,14 +81,28 @@ ngraph::pass::PassConfig::PassConfig()
void ngraph::pass::PassConfig::set_pass_enable(std::string name, bool enable)
{
m_enables[name] = enable;
m_pass_enables[name] = enable;
}
bool ngraph::pass::PassConfig::get_pass_enable(std::string name)
{
if (m_enables.find(name) == m_enables.end())
if (m_pass_enables.find(name) == m_pass_enables.end())
{
return false;
}
return m_pass_enables[name];
}
void ngraph::pass::PassConfig::set_pass_attribute(std::string name, bool enable)
{
m_pass_attributes[name] = enable;
}
bool ngraph::pass::PassConfig::get_pass_attribute(std::string name)
{
if (m_pass_attributes.find(name) == m_pass_attributes.end())
{
return false;
}
return m_enables[name];
return m_pass_attributes[name];
}
......@@ -30,10 +30,14 @@ class ngraph::pass::PassConfig
{
public:
PassConfig();
const std::map<std::string, bool>& get_enables() { return m_enables; }
const std::map<std::string, bool>& get_enables() { return m_pass_enables; }
void set_pass_enable(std::string name, bool enable);
bool get_pass_enable(std::string name);
const std::map<std::string, bool>& get_pass_attributes() { return m_pass_attributes; }
void set_pass_attribute(std::string name, bool enable);
bool get_pass_attribute(std::string name);
private:
std::map<std::string, bool> m_enables;
std::map<std::string, bool> m_pass_enables;
std::map<std::string, bool> m_pass_attributes;
};
......@@ -39,13 +39,9 @@ bool ngraph::pass::PropagateCacheability::run_on_function(std::shared_ptr<Functi
op_annotations = op_annotations_factory();
op->set_op_annotations(op_annotations);
}
if (std::dynamic_pointer_cast<op::Constant>(node))
{
op_annotations->set_cacheable(true);
NGRAPH_DEBUG << "propagate cacheability: cacheability is 1";
}
else if (auto parameter = std::dynamic_pointer_cast<op::Parameter>(node))
if (node->is_parameter())
{
auto parameter = std::static_pointer_cast<op::Parameter>(node);
op_annotations->set_cacheable(parameter->get_cacheable());
NGRAPH_DEBUG << "propagate cacheability: cacheability is "
<< parameter->get_cacheable();
......
......@@ -109,6 +109,7 @@ set(SRC
pass/cpu_layout.cpp
pass/cpu_loop_kernel_fusion.cpp
pass/cpu_mat_fusion.cpp
pass/cpu_memory_assignment.cpp
pass/cpu_memory_optimization.cpp
pass/cpu_post_layout_optimizations.cpp
pass/cpu_rnn_fusion.cpp
......
......@@ -188,7 +188,7 @@ namespace ngraph
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg3_tensor = external_function->get_tensor_data(args[3].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg3_size = args[3].get_size();
size_t arg3_size = node->get_inputs()[3].get_tensor().size();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
......@@ -229,7 +229,7 @@ namespace ngraph
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg2_size = args[2].get_size();
size_t arg2_size = node->get_inputs()[2].get_tensor().size();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
......
......@@ -203,8 +203,8 @@ namespace ngraph
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg3_size = node->get_inputs()[3].get_tensor().size();
size_t arg3_size = args[3].get_size();
auto scales_size = shape_size(args[4].get_shape());
auto sum_scales_size = shape_size(args[5].get_shape());
......@@ -297,8 +297,8 @@ namespace ngraph
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg3_size = node->get_inputs()[3].get_tensor().size();
size_t arg3_size = args[3].get_size();
auto scales_size = shape_size(args[4].get_shape());
auto sum_scales_size = shape_size(args[5].get_shape());
......
......@@ -51,9 +51,10 @@ namespace
}
shared_ptr<runtime::cpu::CPU_CallFrame> runtime::cpu::CPU_Backend::make_call_frame(
const shared_ptr<runtime::cpu::CPU_ExternalFunction>& external_function)
const shared_ptr<runtime::cpu::CPU_ExternalFunction>& external_function,
ngraph::pass::PassConfig& pass_config)
{
return external_function->make_call_frame();
return external_function->make_call_frame(pass_config);
}
shared_ptr<runtime::Tensor>
......@@ -70,6 +71,15 @@ shared_ptr<runtime::Tensor> runtime::cpu::CPU_Backend::create_tensor(
shared_ptr<runtime::Executable>
runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func, bool performance_counters_enabled)
{
ngraph::pass::PassConfig pass_config;
return compile(func, pass_config, performance_counters_enabled);
}
shared_ptr<runtime::Executable>
runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func,
ngraph::pass::PassConfig& pass_config,
bool performance_counters_enabled)
{
shared_ptr<runtime::Executable> rc;
auto it = m_exec_map.find(func);
......@@ -79,13 +89,14 @@ shared_ptr<runtime::Executable>
}
else
{
rc = make_shared<CPU_Executable>(func, performance_counters_enabled);
rc = make_shared<CPU_Executable>(func, pass_config, performance_counters_enabled);
m_exec_map.insert({func, rc});
}
return rc;
}
runtime::cpu::CPU_Executable::CPU_Executable(shared_ptr<Function> func,
ngraph::pass::PassConfig& pass_config,
bool performance_counters_enabled)
{
FunctionInstance& instance = m_function_instance;
......@@ -93,7 +104,7 @@ runtime::cpu::CPU_Executable::CPU_Executable(shared_ptr<Function> func,
{
instance.m_external_function = make_shared<CPU_ExternalFunction>(func);
instance.m_external_function->m_emit_timing = performance_counters_enabled;
auto cf = instance.m_external_function->make_call_frame();
auto cf = instance.m_external_function->make_call_frame(pass_config);
instance.m_call_frame = dynamic_pointer_cast<CPU_CallFrame>(cf);
}
set_parameters_and_results(*func);
......
......@@ -20,6 +20,7 @@
#include <memory>
#include "cpu_backend_visibility.h"
#include "ngraph/pass/pass_config.hpp"
#include "ngraph/runtime/backend.hpp"
namespace ngraph
......@@ -35,7 +36,8 @@ namespace ngraph
{
public:
std::shared_ptr<CPU_CallFrame>
make_call_frame(const std::shared_ptr<CPU_ExternalFunction>& external_function);
make_call_frame(const std::shared_ptr<CPU_ExternalFunction>& external_function,
ngraph::pass::PassConfig& pass_config);
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
......@@ -50,6 +52,11 @@ namespace ngraph
compile(std::shared_ptr<Function> func,
bool enable_performance_counters = false) override;
std::shared_ptr<ngraph::runtime::Executable>
compile(std::shared_ptr<Function> func,
ngraph::pass::PassConfig& pass_config,
bool enable_performance_counters = false);
void remove_compiled_function(std::shared_ptr<Executable> exec) override;
bool is_supported(const Node& node) const override;
......@@ -63,7 +70,9 @@ namespace ngraph
class CPU_BACKEND_API CPU_Executable : public runtime::Executable
{
public:
CPU_Executable(std::shared_ptr<Function> func, bool performance_counters_enabled);
CPU_Executable(std::shared_ptr<Function> func,
ngraph::pass::PassConfig& pass_config,
bool performance_counters_enabled);
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
......
......@@ -129,8 +129,8 @@ bool runtime::cpu::CPU_Debugger::delete_breakpoint(std::shared_ptr<Node> op)
void* runtime::cpu::CPU_Debugger::inspect(std::shared_ptr<Node> op, size_t output_index)
{
return m_callframe.m_external_function->tensor_data.at(op->get_name() + "_" +
to_string(output_index));
return m_callframe.m_external_function->get_tensor_data(op->get_name() + "_" +
to_string(output_index));
}
bool runtime::cpu::CPU_Debugger::add_tracepoint(
......
......@@ -49,6 +49,7 @@
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
#include "ngraph/runtime/performance_counter.hpp"
#include "ngraph/state/state.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
......@@ -95,18 +96,11 @@ namespace ngraph
friend class CPU_Executable;
public:
enum class CPUTensorRole
{
INPUT,
CONSTANT,
OUTPUT,
INTERMEDIATE
};
CPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
bool release_function = true);
~CPU_ExternalFunction();
std::shared_ptr<ngraph::runtime::cpu::CPU_CallFrame> make_call_frame();
std::shared_ptr<ngraph::runtime::cpu::CPU_CallFrame>
make_call_frame(ngraph::pass::PassConfig& pass_config);
const LayoutDescriptorPtrs& get_parameter_layout_descriptors();
const LayoutDescriptorPtrs& get_result_layout_descriptors();
......@@ -171,11 +165,11 @@ namespace ngraph
#endif
protected:
void build();
void build(ngraph::pass::PassConfig& pass_config);
#if !defined(NGRAPH_DEX_ONLY)
void compile();
void compile(ngraph::pass::PassConfig& pass_config);
#endif
......@@ -183,37 +177,8 @@ namespace ngraph
private:
// Register passes that are common to codegen and DEX
void register_common_passes(ngraph::pass::Manager& pass_manager);
// For non-destructive passthrough kernels, propagate function
// constant buffers to internal ops
void propagate_in_place_constant(ngraph::descriptor::Output* output,
std::string input_name,
bool dex);
// For non-destructive passthrough kernels, propagate function
// input buffers to internal ops
void propagate_in_place_input(ngraph::descriptor::Output* output,
std::string input_name,
bool dex);
// For in-place kernels, propagate function output buffers to
// internal ops
void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
std::string output_name,
bool dex);
// Find in-place concat ops and set appropriate memory pool offset for its arguments
void process_in_place_concat(std::list<std::shared_ptr<Node>> nodes);
// For a chain of concat ops, propagate memory pool offsets
void propagate_in_place_concat(std::shared_ptr<ngraph::op::Concat> concat);
// Find in-place slice ops and set appropriate memory pool offset for its output
void process_in_place_slice(std::list<std::shared_ptr<Node>> nodes);
// propagate slice when its arg comes from function input
void propagate_in_place_slice(ngraph::descriptor::Output* output,
size_t input_index,
size_t input_offset);
void register_common_passes(ngraph::pass::Manager& pass_manager,
ngraph::pass::PassConfig& pass_config);
bool computes_result(Node* node);
void release_function() { m_function = nullptr; }
......@@ -238,6 +203,9 @@ namespace ngraph
std::string emit_op_as_function(const Node&, const std::string& function_name);
std::string strip_comments(const std::string&);
std::unordered_set<descriptor::Tensor*>&
get_tensor_set(descriptor::Tensor* output_tensor);
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
......@@ -269,8 +237,10 @@ namespace ngraph
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<std::string, std::pair<std::size_t, std::size_t>>
m_variable_input_index_offset_map;
std::unordered_map<std::string, std::pair<std::size_t, std::size_t>>
m_variable_output_index_offset_map;
std::unordered_map<std::string, CPUTensorRole> m_tensor_roles;
std::unordered_map<std::string, ngraph::CPUTensorRole> m_tensor_roles;
LayoutDescriptorPtrs parameter_layout_descriptors;
LayoutDescriptorPtrs result_layout_descriptors;
......@@ -290,15 +260,33 @@ namespace ngraph
executor;
std::unordered_map<std::string, void*> tensor_data;
std::unordered_map<std::string, bool> tensor_stale;
// Each tensor is put into one buffer set.
// All the tensors in the same buffer set share the same memory buffer.
// bufferID_to_tensorSets maps bufferID to the pair of CPUTensorRole and buffer set.
// CPUTensorRole is INPUT, CONSTANT, OUTPUT, or INTERMEDIATE,
// which tells from where the memory buffer comes.
std::unordered_map<
size_t,
std::pair<ngraph::CPUTensorRole, std::unordered_set<descriptor::Tensor*>>>
bufferID_to_tensorSets;
// tensor_to_bufferID maps tensor to the ID of the buffer set it belongs to.
std::unordered_map<descriptor::Tensor*, size_t> tensor_to_bufferID;
std::unordered_map<std::string, std::string> tensor_alias;
std::unordered_map<std::string, size_t> function_input_name_index;
// tensor pointer and its offset into the memory allocated for intermediates
// used to calculate the correct address at runtime
std::list<std::pair<std::reference_wrapper<void*>, size_t>> intermediates_offsets;
// tensor pointer, input index, offset into the input, and if the input is stale
// used to calculate the correct address at runtime
std::list<std::tuple<std::reference_wrapper<void*>,
size_t,
size_t,
std::reference_wrapper<bool>>>
function_input_index_offset;
// tensor pointer, output index, and offset into the output
// used to calculate the correct address at runtime
std::list<std::tuple<std::reference_wrapper<void*>, size_t, size_t>>
intermediate_input_index_offset;
std::list<
std::tuple<std::reference_wrapper<void*>, size_t, std::reference_wrapper<bool>>>
function_input_index;
std::list<std::pair<std::reference_wrapper<void*>, size_t>> function_output_index;
function_output_index_offset;
std::unordered_map<std::string, std::shared_ptr<CPU_ExternalFunction>> callees;
bool m_is_built;
std::vector<runtime::PerformanceCounter> m_perf_counters;
......
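
The two maps above are also what the new get_tensor_set() helper (declared earlier in this header) resolves through. Its real body lives in cpu_external_function.cpp, which is not shown in this view; conceptually it is just a two-step lookup, sketched here under that assumption:

// Conceptual sketch only -- not the actual implementation.
std::unordered_set<ngraph::descriptor::Tensor*>&
    CPU_ExternalFunction::get_tensor_set(ngraph::descriptor::Tensor* output_tensor)
{
    // Every tensor was assigned to exactly one buffer set during memory assignment.
    size_t buffer_id = tensor_to_bufferID.at(output_tensor);
    // The mapped value pairs the buffer's origin (CPUTensorRole) with the set of
    // tensors sharing that buffer; callers only need the set.
    return bufferID_to_tensorSets.at(buffer_id).second;
}
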
This diff is collapsed.
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <limits>
#include <list>
#include <sstream>
#include <unordered_map>
#include <unordered_set>
#include "ngraph/pass/pass.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{
class CPUMemoryAssignment;
}
}
}
}
class ngraph::runtime::cpu::pass::CPUMemoryAssignment : public ngraph::pass::FunctionPass
{
public:
CPUMemoryAssignment(
std::unordered_map<size_t,
std::pair<CPUTensorRole, std::unordered_set<descriptor::Tensor*>>>&,
std::unordered_map<descriptor::Tensor*, size_t>&,
size_t alignment = 1,
bool disable_memory_sharing = false);
bool run_on_function(std::shared_ptr<ngraph::Function>) override;
private:
// Find in-place concat ops and set appropriate memory pool offset for its arguments
void process_in_place_concat(std::list<std::shared_ptr<Node>> nodes);
// For a chain of concat ops, propagate memory pool offsets
void propagate_in_place_concat(std::shared_ptr<ngraph::op::Op> concat, size_t index);
// Find in-place slice ops and set appropriate memory pool offset for its output
void process_in_place_slice(std::list<std::shared_ptr<Node>> nodes);
// propagate slice when its arg comes from function input
void propagate_in_place_slice(ngraph::descriptor::Input* input, size_t input_index);
// build buffer sets maps
void build_buffer_sets_maps(std::list<std::shared_ptr<Node>>& ops);
// liveness analysis to build new and free list for each node
void liveness_analysis(std::list<std::shared_ptr<Node>>& ops);
size_t get_bufferID(descriptor::Tensor* tensor);
size_t m_alignment;
bool m_disable_memory_sharing;
std::set<descriptor::Tensor*> m_tensor_caching;
std::unordered_map<size_t,
std::pair<ngraph::CPUTensorRole, std::unordered_set<descriptor::Tensor*>>>&
m_bufferID_to_tensorSets;
std::unordered_map<descriptor::Tensor*, size_t>& m_tensor_to_bufferID;
};
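
In the backend, this pass is registered from CPU_ExternalFunction::register_common_passes(), whose implementation is not shown here. The sketch below covers only the registration mechanics under that assumption; the header path and helper name are illustrative, the real pipeline runs the pass together with the other common passes, and the attribute key matches the one used in the new unit test.

#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/pass_config.hpp"
#include "ngraph/runtime/cpu/pass/cpu_memory_assignment.hpp" // path assumed from the CMake entry above

void register_cpu_memory_assignment(
    ngraph::pass::Manager& pass_manager,
    ngraph::pass::PassConfig& pass_config,
    std::unordered_map<size_t,
                       std::pair<ngraph::CPUTensorRole,
                                 std::unordered_set<ngraph::descriptor::Tensor*>>>&
        bufferID_to_tensorSets,
    std::unordered_map<ngraph::descriptor::Tensor*, size_t>& tensor_to_bufferID,
    size_t alignment)
{
    // Memory sharing stays disabled unless reuse was requested via PassConfig or
    // NGRAPH_PASS_ATTRIBUTES.
    bool reuse = pass_config.get_pass_attribute("CPUMemoryAssignment::ReuseMemory");
    pass_manager.register_pass<ngraph::runtime::cpu::pass::CPUMemoryAssignment>(
        bufferID_to_tensorSets, tensor_to_bufferID, alignment, /*disable_memory_sharing=*/!reuse);
}
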
......@@ -62,8 +62,9 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
{
for (auto n : function->get_ordered_ops())
{
if (auto concat = std::dynamic_pointer_cast<op::Concat>(n))
if (n->description() == "Concat")
{
auto concat = std::static_pointer_cast<op::Concat>(n);
auto shape = concat->get_input_shape(0);
auto axis = concat->get_concatenation_axis();
auto product = 1;
......@@ -119,8 +120,7 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
const auto& output = input.get_output();
auto arg = output.get_node();
if (std::dynamic_pointer_cast<op::Constant>(arg) ||
std::dynamic_pointer_cast<op::Parameter>(arg))
if (arg->is_constant() || arg->is_parameter())
{
NGRAPH_DEBUG << "cpu_memory_optimization: " << arg->get_name()
<< ": constant or parameter, no in place concat";
......@@ -130,7 +130,7 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
NGRAPH_ASSERT(arg->get_output_size() == 1);
if (!std::dynamic_pointer_cast<op::Concat>(arg))
if (arg->description() != "Concat")
{
if (arg->is_op())
{
......@@ -154,7 +154,7 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
for (auto output_input : output.get_inputs())
{
auto user = output_input->get_node();
if (std::dynamic_pointer_cast<op::Concat>(user))
if (user->description() == "Concat")
{
concat_count++;
if (concat_count == 2)
......@@ -225,8 +225,9 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
for (auto n : function->get_ordered_ops())
{
if (auto slice = std::dynamic_pointer_cast<op::Slice>(n))
if (n->description() == "Slice")
{
auto slice = std::static_pointer_cast<op::Slice>(n);
auto in_shape = slice->get_input_shape(0);
auto out_shape = slice->get_output_shape(0);
auto strides = slice->get_strides();
......@@ -235,7 +236,6 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
auto upper_bounds = slice->get_upper_bounds();
auto arg = slice->get_argument(0);
if (arg->is_constant())
{
NGRAPH_DEBUG << "cpu_memory_optimization: " << arg->get_name()
......@@ -243,25 +243,6 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
continue;
}
bool no_in_place_slice = false;
if (arg->is_parameter())
{
for (auto user : slice->get_users())
{
if (user->is_output())
{
NGRAPH_DEBUG << "cpu_memory_optimization: slice between function input and "
"output, no in place slice";
no_in_place_slice = true;
break;
}
}
}
if (no_in_place_slice)
{
continue;
}
if (is_strided(strides))
{
NGRAPH_DEBUG << "cpu_memory_optimization: strided slice, no in place slice";
......
......@@ -219,6 +219,14 @@ namespace ngraph
* bprop function will have these nodes as the first N input parameters
**/
FpropCache cache_fprop(std::shared_ptr<Function> fprop, std::shared_ptr<Function> bprop);
enum class CPUTensorRole
{
INPUT,
CONSTANT,
OUTPUT,
INTERMEDIATE
};
} // end namespace ngraph
std::ostream& operator<<(std::ostream& os, const ngraph::NodeVector& nv);
......@@ -2861,7 +2861,7 @@ NGRAPH_TEST(${BACKEND_NAME}, computation_reuse)
Shape shape_a{1, 16, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{32, 16, 1, 1};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
auto B = make_shared<op::Parameter>(element::f32, shape_b, true);
Shape shape_r{1, 32, 2, 2};
auto conv = make_shared<op::Convolution>(A,
B,
......
......@@ -32,6 +32,7 @@
#include "ngraph/op/parameter.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
......@@ -669,6 +670,206 @@ TEST(cpu_test, convolution_large_padding)
compare_backends(int_f, cpu_f, "INTERPRETER", "CPU", 1e-4, 1e-4);
}
#if 0
static std::shared_ptr<Function> make_function(const std::string& file_name)
{
const string json_path = file_util::path_join(SERIALIZED_ZOO, file_name);
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> func = ngraph::deserialize(ss);
return func;
}
TEST(cpu_test, memory_reuse_mxnet_densenet121)
{
const std::string file_name("mxnet/mxnet_densenet121_inference_batch1_float32.json");
auto cpu_f = make_function(file_name);
test::Uniform<float> rng(-1.0f, 1.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
// without memory reuse
auto cpu_results = execute(cpu_f, args, "CPU");
auto cpu_f_new = make_function(file_name);
auto cpu_results_new = execute(cpu_f_new, args, "CPU");
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(test::all_close(cpu_results.at(i), cpu_results_new.at(i), 1.0e-4f, 1.0e-4f));
}
// with memory reuse
auto backend = runtime::Backend::create("CPU");
auto parms = cpu_f->get_parameters();
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> arg_tensors(args.size());
for (size_t i = 0; i < args.size(); i++)
{
auto t = backend->create_tensor(parms.at(i)->get_element_type(), parms.at(i)->get_shape());
copy_data(t, args.at(i));
arg_tensors.at(i) = t;
}
auto results = cpu_f->get_results();
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> result_tensors(results.size());
for (size_t i = 0; i < results.size(); i++)
{
result_tensors.at(i) =
backend->create_tensor(results.at(i)->get_element_type(), results.at(i)->get_shape());
}
ngraph::pass::PassConfig pass_config;
pass_config.set_pass_attribute("CPUMemoryAssignment::ReuseMemory", true);
auto cpu_backend = std::unique_ptr<runtime::cpu::CPU_Backend>(
static_cast<runtime::cpu::CPU_Backend*>(backend.release()));
auto cpu_f_new_reuse = make_function(file_name);
shared_ptr<runtime::Executable> handle = cpu_backend->compile(cpu_f_new_reuse, pass_config);
for (auto it = 0; it < 2; it++)
{
handle->call_with_validate(result_tensors, arg_tensors);
std::vector<std::vector<float>> cpu_results_new_reuse;
for (auto rt : result_tensors)
{
cpu_results_new_reuse.push_back(read_vector<float>(rt));
}
for (size_t i = 0; i < cpu_results.size(); i++)
{
EXPECT_TRUE(
test::all_close(cpu_results.at(i), cpu_results_new_reuse.at(i), 1.0e-4f, 1.0e-4f));
}
}
}
#endif
TEST(cpu_test, memory_reuse_destructive_oi_relu)
{
auto shape_a = Shape{2, 5};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto B = make_shared<op::Parameter>(element::f32, shape_a);
auto C = make_shared<op::Parameter>(element::f32, shape_a);
auto add = make_shared<op::Add>(A, B);
auto relu = make_shared<op::Relu>(add);
auto subtract = make_shared<op::Subtract>(C, relu);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(subtract, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("CPU");
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto b = backend->create_tensor(element::f32, shape_a);
copy_data(b, vector<float>{1, 2, 3, 4, 0.5, 1, 8, -8, 17, -0.5});
auto c = backend->create_tensor(element::f32, shape_a);
copy_data(c, vector<float>{2, 10, 0, 21, 0, 2, 16, 0, 34, 0});
auto result = backend->create_tensor(element::f32, shape_rt);
vector<float> expected{0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
shared_ptr<runtime::Executable> handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(cpu_test, memory_reuse_cacheable_no_destructive_oi_relu)
{
auto shape_a = Shape{2, 5};
auto A = make_shared<op::Parameter>(element::f32, shape_a, true);
auto B = make_shared<op::Parameter>(element::f32, shape_a, true);
auto C = make_shared<op::Parameter>(element::f32, shape_a);
auto add = make_shared<op::Add>(A, B);
auto relu = make_shared<op::Relu>(add);
auto subtract = make_shared<op::Subtract>(C, relu);
auto shape_rt = Shape{2, 5};
auto f = make_shared<Function>(subtract, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("CPU");
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5});
auto b = backend->create_tensor(element::f32, shape_a);
copy_data(b, vector<float>{1, 2, 3, 4, 0.5, 1, 8, -8, 17, -0.5});
auto c = backend->create_tensor(element::f32, shape_a);
copy_data(c, vector<float>{2, 10, 0, 21, 0, 2, 16, 0, 34, 0});
auto result = backend->create_tensor(element::f32, shape_rt);
vector<float> expected{0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
shared_ptr<runtime::Executable> handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
EXPECT_EQ(read_vector<float>(result), expected);
a->set_stale(false);
b->set_stale(false);
handle->call_with_validate({result}, {a, b, c});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(cpu_test, memory_reuse_in_place_concat_after_in_place_slice)
{
Shape shape_a{4, 4};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto B = make_shared<op::Slice>(A, Coordinate{0, 0}, Coordinate{2, 4});
auto D = make_shared<op::Slice>(B, Coordinate{1, 0}, Coordinate{2, 4});
auto E = make_shared<op::Slice>(A, Coordinate{2, 0}, Coordinate{3, 4});
auto r = make_shared<op::Concat>(NodeVector{B, D, E}, 0);
auto f = make_shared<Function>(r, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
auto result = backend->create_tensor(element::f32, shape_a);
shared_ptr<runtime::Executable> handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_EQ((vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12}),
read_vector<float>(result));
}
TEST(cpu_test, memory_reuse_in_place_slice_after_in_place_concat)
{
Shape shape{1, 1};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto add1 = make_shared<op::Add>(A, B);
auto C = make_shared<op::Parameter>(element::f32, shape);
auto D = make_shared<op::Parameter>(element::f32, shape);
auto add2 = make_shared<op::Add>(C, D);
auto subtract = make_shared<op::Subtract>(C, A);
auto concat = make_shared<op::Concat>(NodeVector{add1, add2, subtract}, 0);
Shape shape_r{2, 1};
auto slice = make_shared<op::Slice>(concat, Coordinate{0, 0}, Coordinate{2, 1});
auto f = make_shared<Function>(slice, ParameterVector{A, B, C, D});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{1});
auto b = backend->create_tensor(element::f32, shape);
copy_data(b, vector<float>{2});
auto c = backend->create_tensor(element::f32, shape);
copy_data(c, vector<float>{3});
auto d = backend->create_tensor(element::f32, shape);
copy_data(d, vector<float>{4});
auto result = backend->create_tensor(element::f32, shape_r);
shared_ptr<runtime::Executable> handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c, d});
EXPECT_EQ((vector<float>{3, 7}), read_vector<float>(result));
}
TEST(cpu_test, convert_inplace)
{
Shape shape{2, 2};
......