Commit 41c50b44 authored by Jayaram Bobba, committed by Scott Cyphers

Computation reuse (#945)

* Make temp memory pools static to avoid memory allocation overheads

* Initial implementation for graph control to enable caching and computation reuse

* Added sphinx documentation

* Turned off memory buffer reuse in CPU transformer to support computation reuse. Added unit test

* Change memoizable to cacheable

* Change memoizable to cacheable

* Rename variables
parent 698f29f1
...@@ -25,6 +25,9 @@ Attributes ...@@ -25,6 +25,9 @@ Attributes
+------------------+------------------------------------------+ +------------------+------------------------------------------+
| ``shape`` | The ``Shape`` of the parameter. | | ``shape`` | The ``Shape`` of the parameter. |
+------------------+------------------------------------------+ +------------------+------------------------------------------+
| ``cacheable`` | True if the parameter is not expected to |
| | be frequently updated. |
+------------------+------------------------------------------+
Outputs Outputs
------- -------
......
...@@ -21,8 +21,11 @@ ...@@ -21,8 +21,11 @@
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
op::Parameter::Parameter(const element::Type& element_type, const Shape& shape) op::Parameter::Parameter(const element::Type& element_type,
const Shape& shape,
const bool cacheable)
: Op("Parameter", {}) : Op("Parameter", {})
, m_cacheable(cacheable)
{ {
add_output(element_type, shape); add_output(element_type, shape);
} }
......
...@@ -40,10 +40,17 @@ namespace ngraph ...@@ -40,10 +40,17 @@ namespace ngraph
/// ///
/// \param element_type The element type of the parameter. /// \param element_type The element type of the parameter.
/// \param shape The shape of the parameter. /// \param shape The shape of the parameter.
Parameter(const ngraph::element::Type& element_type, const Shape& shape); /// \param cacheable True if the parameter is not expected to be frequently updated.
Parameter(const ngraph::element::Type& element_type,
const Shape& shape,
const bool cacheable = false);
const bool get_cacheable() const { return m_cacheable; }
virtual std::shared_ptr<Node> virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override; copy_with_new_args(const NodeVector& new_args) const override;
protected:
bool m_cacheable;
}; };
} }
} }
...@@ -27,8 +27,9 @@ ...@@ -27,8 +27,9 @@
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
pass::MemoryLayout::MemoryLayout(size_t alignment) pass::MemoryLayout::MemoryLayout(size_t alignment, bool disable_memory_sharing)
: m_alignment(alignment) : m_alignment(alignment)
, m_disable_memory_sharing(disable_memory_sharing)
{ {
} }
...@@ -42,9 +43,12 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function) ...@@ -42,9 +43,12 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
size_t offset = mm.allocate(tensor->size()); size_t offset = mm.allocate(tensor->size());
tensor->set_pool_offset(offset); tensor->set_pool_offset(offset);
} }
for (const descriptor::Tensor* tensor : node->liveness_free_list) if (!m_disable_memory_sharing)
{ {
mm.free(tensor->get_pool_offset()); for (const descriptor::Tensor* tensor : node->liveness_free_list)
{
mm.free(tensor->get_pool_offset());
}
} }
} }
function->set_temporary_pool_size(mm.max_allocated()); function->set_temporary_pool_size(mm.max_allocated());
......
...@@ -35,11 +35,12 @@ namespace ngraph ...@@ -35,11 +35,12 @@ namespace ngraph
class ngraph::pass::MemoryLayout : public FunctionPass class ngraph::pass::MemoryLayout : public FunctionPass
{ {
public: public:
MemoryLayout(size_t alignment = 1); MemoryLayout(size_t alignment = 1, bool disable_memory_sharing = false);
bool run_on_function(std::shared_ptr<ngraph::Function>) override; bool run_on_function(std::shared_ptr<ngraph::Function>) override;
private: private:
size_t m_alignment; size_t m_alignment;
bool m_disable_memory_sharing;
}; };
class ngraph::pass::MemoryManager class ngraph::pass::MemoryManager
......
...@@ -51,6 +51,7 @@ void runtime::cpu::CPU_CallFrame::call( ...@@ -51,6 +51,7 @@ void runtime::cpu::CPU_CallFrame::call(
{ {
shared_ptr<runtime::cpu::CPUTensorView> tv = shared_ptr<runtime::cpu::CPUTensorView> tv =
static_pointer_cast<runtime::cpu::CPUTensorView>(input_tvs[i]); static_pointer_cast<runtime::cpu::CPUTensorView>(input_tvs[i]);
ctx->p_en[i] = tv->get_stale();
inputs.push_back(tv->get_data_ptr()); inputs.push_back(tv->get_data_ptr());
} }
for (size_t i = 0; i < output_tvs.size(); i++) for (size_t i = 0; i < output_tvs.size(); i++)
...@@ -100,6 +101,7 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context() ...@@ -100,6 +101,7 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
{ {
ctx->op_durations = new int64_t[m_external_function->get_op_attrs().size()]; ctx->op_durations = new int64_t[m_external_function->get_op_attrs().size()];
} }
ctx->p_en = new bool[m_external_function->get_parameter_layout_descriptors().size()];
const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter(); const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter();
ctx->mkldnn_primitives = mkldnn_emitter->get_mkldnn_primitives().data(); ctx->mkldnn_primitives = mkldnn_emitter->get_mkldnn_primitives().data();
ctx->mkldnn_workspaces = mkldnn_emitter->get_mkldnn_workspaces().data(); ctx->mkldnn_workspaces = mkldnn_emitter->get_mkldnn_workspaces().data();
...@@ -108,5 +110,6 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context() ...@@ -108,5 +110,6 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
void runtime::cpu::CPU_CallFrame::cleanup_runtime_context() void runtime::cpu::CPU_CallFrame::cleanup_runtime_context()
{ {
delete[] ctx->op_durations; delete[] ctx->op_durations;
delete[] ctx->p_en;
delete ctx; delete ctx;
} }
...@@ -330,7 +330,7 @@ void runtime::cpu::CPU_ExternalFunction::compile() ...@@ -330,7 +330,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
pass_manager.register_pass<ngraph::pass::ResultCopyElimination>(); pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>(); pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>(); pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment); pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment, true);
pass_manager.run_passes(m_function); pass_manager.run_passes(m_function);
unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops; unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
...@@ -588,6 +588,26 @@ using namespace ngraph::runtime; ...@@ -588,6 +588,26 @@ using namespace ngraph::runtime;
<< ");\n"; << ");\n";
} }
// Indexing for Control Flags
std::map<std::string, size_t> tensor_index_map;
std::map<std::string, size_t> param_index_map;
size_t tensor_index = 0;
for (shared_ptr<Node> node : ordered_ops)
{
if (!node->is_parameter() && !node->is_constant())
{
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
tensor_index_map.insert({tv->get_tensor().get_name(), tensor_index++});
}
}
}
writer << "bool " << current_function->get_name() << "_t_en[" << tensor_index << "];\n";
writer << "bool " << current_function->get_name() << "_init = true;\n";
writer << "extern \"C\" void " << current_function->get_name(); writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, cpu::CPURuntimeContext* ctx)\n"; writer << "(void** inputs, void** outputs, cpu::CPURuntimeContext* ctx)\n";
writer << "{\n"; writer << "{\n";
...@@ -625,6 +645,8 @@ using namespace ngraph::runtime; ...@@ -625,6 +645,8 @@ using namespace ngraph::runtime;
} }
} }
writer << "bool* t_en = (bool*)" << current_function->get_name() << "_t_en;\n";
// Add inputs to the variable name map // Add inputs to the variable name map
size_t arg_index = 0; size_t arg_index = 0;
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters()) for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
...@@ -637,6 +659,7 @@ using namespace ngraph::runtime; ...@@ -637,6 +659,7 @@ using namespace ngraph::runtime;
stringstream ss; stringstream ss;
ss << "((" << type << "*)(inputs[" << arg_index << "]))"; ss << "((" << type << "*)(inputs[" << arg_index << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str(); m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
param_index_map[tv->get_tensor().get_name()] = arg_index;
arg_index++; arg_index++;
} }
} }
...@@ -748,6 +771,30 @@ using namespace ngraph::runtime; ...@@ -748,6 +771,30 @@ using namespace ngraph::runtime;
{ {
emit_debug_function_entry(writer, node.get(), in, out); emit_debug_function_entry(writer, node.get(), in, out);
} }
// Op Control
if (!node->is_parameter() && !node->is_constant())
{
writer << "if (" << current_function->get_name() << "_init ";
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
auto input_name = tv->get_tensor().get_name();
if (output.get_node()->is_parameter())
{
writer << " || ctx->p_en[" << param_index_map[input_name] << "]";
}
else if (!output.get_node()->is_constant())
{
writer << " || t_en[" << tensor_index_map[input_name] << "]";
}
}
writer << ") {\n";
writer.indent++;
}
string func_name; string func_name;
auto it = match_functions.find(node.get()); auto it = match_functions.find(node.get());
if (it == match_functions.end()) if (it == match_functions.end())
...@@ -794,6 +841,19 @@ using namespace ngraph::runtime; ...@@ -794,6 +841,19 @@ using namespace ngraph::runtime;
// Emit operation epilogue // Emit operation epilogue
if (!node->is_parameter() && !node->is_constant()) if (!node->is_parameter() && !node->is_constant())
{ {
for (auto output_name : node_output_names)
{
writer << "t_en[" << tensor_index_map[output_name] << "] = true;\n";
}
writer.indent--;
writer << "} else {\n";
writer.indent++;
for (auto output_name : node_output_names)
{
writer << "t_en[" << tensor_index_map[output_name] << "] = false;\n";
}
writer.indent--;
writer << "}\n";
emit_debug_function_exit(writer, node.get(), in, out); emit_debug_function_exit(writer, node.get(), in, out);
if (runtime::cpu::IsTracingEnabled() && if (runtime::cpu::IsTracingEnabled() &&
current_function->get_name() == m_function_name) current_function->get_name() == m_function_name)
...@@ -850,6 +910,7 @@ using namespace ngraph::runtime; ...@@ -850,6 +910,7 @@ using namespace ngraph::runtime;
writer << "try { G.wait_for_all(); } catch(...) { throw; }\n"; writer << "try { G.wait_for_all(); } catch(...) { throw; }\n";
} }
} }
writer << current_function->get_name() << "_init = false;\n";
writer.indent--; writer.indent--;
// End generated function // End generated function
......
...@@ -38,6 +38,7 @@ namespace ngraph ...@@ -38,6 +38,7 @@ namespace ngraph
struct CPURuntimeContext struct CPURuntimeContext
{ {
int64_t* op_durations; int64_t* op_durations;
bool* p_en;
mkldnn::primitive* const* mkldnn_primitives; mkldnn::primitive* const* mkldnn_primitives;
char* const* mkldnn_workspaces; char* const* mkldnn_workspaces;
}; };
......
...@@ -38,6 +38,7 @@ namespace ngraph ...@@ -38,6 +38,7 @@ namespace ngraph
protected: protected:
TensorView(const std::shared_ptr<ngraph::descriptor::TensorView>& descriptor) TensorView(const std::shared_ptr<ngraph::descriptor::TensorView>& descriptor)
: m_descriptor(descriptor) : m_descriptor(descriptor)
, m_stale(true)
{ {
} }
...@@ -58,6 +59,8 @@ namespace ngraph ...@@ -58,6 +59,8 @@ namespace ngraph
std::shared_ptr<ngraph::descriptor::layout::TensorViewLayout> std::shared_ptr<ngraph::descriptor::layout::TensorViewLayout>
get_tensor_view_layout() const; get_tensor_view_layout() const;
bool get_stale() { return m_stale; }
/// @brief Overwrites the stale flag of this tensor view.
/// @param val New value of the stale flag.
void set_stale(bool val) { m_stale = val; }
/// @brief Write bytes directly into the tensor /// @brief Write bytes directly into the tensor
/// @param p Pointer to source of data /// @param p Pointer to source of data
/// @param tensor_offset Offset into tensor storage to begin writing. Must be element-aligned. /// @param tensor_offset Offset into tensor storage to begin writing. Must be element-aligned.
...@@ -72,6 +75,7 @@ namespace ngraph ...@@ -72,6 +75,7 @@ namespace ngraph
protected: protected:
std::shared_ptr<ngraph::descriptor::TensorView> m_descriptor; std::shared_ptr<ngraph::descriptor::TensorView> m_descriptor;
bool m_stale;
}; };
using TensorViewPtrs = std::vector<std::shared_ptr<TensorView>>; using TensorViewPtrs = std::vector<std::shared_ptr<TensorView>>;
......
...@@ -739,7 +739,8 @@ static shared_ptr<ngraph::Function> ...@@ -739,7 +739,8 @@ static shared_ptr<ngraph::Function>
node_js.count("element_type") == 0 ? node_js.at("value_type") : node_js; node_js.count("element_type") == 0 ? node_js.at("value_type") : node_js;
auto element_type = read_element_type(type_node_js.at("element_type")); auto element_type = read_element_type(type_node_js.at("element_type"));
auto shape = type_node_js.at("shape"); auto shape = type_node_js.at("shape");
node = make_shared<op::Parameter>(element_type, shape); auto cacheable = get_or_default<bool>(node_js, "cacheable", false);
node = make_shared<op::Parameter>(element_type, shape, cacheable);
} }
else if (node_op == "Power") else if (node_op == "Power")
{ {
......
...@@ -5837,6 +5837,48 @@ TEST(${BACKEND_NAME}, mkldnn_layouts) ...@@ -5837,6 +5837,48 @@ TEST(${BACKEND_NAME}, mkldnn_layouts)
EXPECT_EQ(vector<float>{expected_result}, rv); EXPECT_EQ(vector<float>{expected_result}, rv);
} }
// Runs AvgPool(Convolution(data, weights)) + broadcast constant twice on the CPU
// backend. Before the second call the weights tensor is marked as not stale; the
// output of the second call must equal the output of the first.
TEST(${BACKEND_NAME}, computation_reuse)
{
    ONLY_ENABLE_TEST_FOR("CPU", "${BACKEND_NAME}");

    // Shapes of the data input, the convolution filters and the final result.
    Shape data_shape{1, 16, 2, 2};
    Shape filter_shape{32, 16, 1, 1};
    Shape out_shape{1, 32, 2, 2};
    Shape window_shape{1, 1};

    // Graph construction.
    auto data_param = make_shared<op::Parameter>(element::f32, data_shape);
    auto filter_param = make_shared<op::Parameter>(element::f32, filter_shape);
    auto convolution = make_shared<op::Convolution>(data_param,
                                                    filter_param,
                                                    Strides{1, 1},
                                                    Strides{1, 1},
                                                    CoordinateDiff{0, 0},
                                                    CoordinateDiff{0, 0},
                                                    Strides{1, 1});
    auto pooled = make_shared<op::AvgPool>(convolution, window_shape);
    auto scalar = op::Constant::create(element::f32, Shape{}, {2.14});
    auto offset = make_shared<op::Broadcast>(scalar, out_shape, AxisSet{0, 1, 2, 3});
    auto output_node = make_shared<op::Result>(pooled + offset);
    auto func = make_shared<Function>(ResultVector{output_node},
                                      op::ParameterVector{data_param, filter_param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Host buffers that back the tensors handed to the backend.
    vector<float> data_values(64, 1.0f);
    vector<float> filter_values(512, 0.5f);
    vector<float> out_values(128);

    auto data_tv = backend->create_tensor(element::f32, data_shape, data_values.data());
    auto filter_tv = backend->create_tensor(element::f32, filter_shape, filter_values.data());
    auto out_tv = backend->create_tensor(element::f32, out_shape, out_values.data());

    // First call: snapshot the output buffer afterwards.
    backend->call(func, {out_tv}, {data_tv, filter_tv});
    vector<float> first_run(out_values);

    // Second call with the filter tensor flagged as not stale.
    filter_tv->set_stale(false);
    backend->call(func, {out_tv}, {data_tv, filter_tv});

    EXPECT_EQ(first_run, out_values);
}
TEST(${BACKEND_NAME}, avg_pool_1d_1channel_1image) TEST(${BACKEND_NAME}, avg_pool_1d_1channel_1image)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}"); SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
......
...@@ -205,11 +205,13 @@ void run_benchmark(shared_ptr<Function> f, ...@@ -205,11 +205,13 @@ void run_benchmark(shared_ptr<Function> f,
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl; cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::TensorView>> args; vector<shared_ptr<runtime::TensorView>> args;
vector<bool> args_cacheable;
for (shared_ptr<op::Parameter> param : f->get_parameters()) for (shared_ptr<op::Parameter> param : f->get_parameters())
{ {
auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape()); auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape());
random_init(tensor); random_init(tensor);
args.push_back(tensor); args.push_back(tensor);
args_cacheable.push_back(param->get_cacheable());
} }
vector<shared_ptr<runtime::TensorView>> results; vector<shared_ptr<runtime::TensorView>> results;
for (shared_ptr<Node> out : f->get_results()) for (shared_ptr<Node> out : f->get_results())
...@@ -218,6 +220,13 @@ void run_benchmark(shared_ptr<Function> f, ...@@ -218,6 +220,13 @@ void run_benchmark(shared_ptr<Function> f,
results.push_back(result); results.push_back(result);
} }
for (size_t i = 0; i < args.size(); i++)
{
if (args_cacheable[i])
{
args[i]->set_stale(false);
}
}
stopwatch t1; stopwatch t1;
t1.start(); t1.start();
for (size_t i = 0; i < static_cast<size_t>(iterations); i++) for (size_t i = 0; i < static_cast<size_t>(iterations); i++)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment