Commit 44b58722 authored by Nishant Patel, committed by Sang Ik Lee

LRU cache for dynamic shapes (#3827)

* LRU cache for caching graphs

* static

* LRU Cache

* Cache graph

* Make data members static

* Temp test case functional

* Temp test case functional

* Fix shape

* Make lru local to dynamic wrapper

* Make cache thread safe

* Remove static copies of data members

* Add a separator between shapes of different inputs

* Clear list and map in destructor

* Caching on values of shape relevant inputs

* Replace cout calls with NGRAPH_INFO

* Add an environment variable for cache size

* Add mutex header

* style

* change to int64_t

* Save the cloned function to get the output shape to allocate output storage

* Pass inputs without wrapping

* Fix conv shape relevant inputs

* gcc 4.8 doesn't support ostringstream as a copyable object

* Pass key by reference

* PR feedback

* Apply suggestions from code review

* Replace malloc
Co-authored-by: Scott Cyphers <diyessi@users.noreply.github.com>
Co-authored-by: Robert Kimball <robert.kimball@intel.com>
parent 4614e4d9
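For orientation before the diff itself: this commit keys compiled executables on a flattened integer vector built from all inputs. A shape-relevant input contributes its element values, any other input contributes its shape dimensions, and -1 separates inputs. A minimal sketch of that keying scheme (the helper name is hypothetical, not part of this diff):

// Hypothetical illustration of the merged-key layout this commit introduces.
// Shape-relevant inputs contribute their *values*; other inputs contribute
// their shape dimensions; -1 separates inputs.
#include <vector>

std::vector<int> make_merged_key(const std::vector<std::vector<int>>& per_input_parts)
{
    std::vector<int> key;
    for (const auto& part : per_input_parts)
    {
        key.insert(key.end(), part.begin(), part.end());
        key.push_back(-1); // separator between inputs
    }
    return key;
}

// Example: {{2, 2, 3, 3}, {4, 5}} -> {2, 2, 3, 3, -1, 4, 5, -1}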
......@@ -590,6 +590,8 @@ set (SRC
runtime/backend.hpp
runtime/backend_manager.cpp
runtime/backend_manager.hpp
runtime/cache.cpp
runtime/cache.hpp
runtime/chrome_trace.cpp
runtime/chrome_trace.hpp
runtime/executable.cpp
......
......@@ -519,8 +519,6 @@ void op::v1::ConvolutionBackpropFilters::validate_and_infer_types()
").");
}
set_input_is_relevant_to_shape(0);
set_input_is_relevant_to_shape(1);
set_input_is_relevant_to_shape(2);
set_output_type(0, forward_result_et, filters_shape);
}
......
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cache.hpp"
using namespace ngraph;
using namespace std;
// Constructor
runtime::LRUCache::LRUCache()
{
    char* cache_size = getenv("NGRAPH_CACHE_SIZE");
    if (cache_size == nullptr)
    {
        m_cache_size = 1024; // TODO(nbpatel): Figure out a default size for the cache
    }
    else
    {
        m_cache_size = atoi(cache_size);
    }
    m_map = {};
    m_list = {};
}

// Destructor
runtime::LRUCache::~LRUCache()
{
    m_list.clear();
    m_map.clear();
    m_clone_function_map.clear();
}

void runtime::LRUCache::convert_shape_to_string(const vector<int>& shape, ostringstream& key)
{
    if (!shape.empty())
    {
        std::copy(shape.begin(), shape.end(), std::ostream_iterator<int>(key, ", "));
    }
}

void runtime::LRUCache::add_entry(const vector<int>& shape,
                                  shared_ptr<runtime::Executable> exec,
                                  shared_ptr<Function> func)
{
    std::lock_guard<std::mutex> guard(m_mutex);
    ostringstream key;
    // if the cache is full, evict the least-recently-used entry
    if (m_list.size() == m_cache_size)
    {
        ostringstream key;
        convert_shape_to_string(m_list.back(), key);
        m_list.pop_back();
        m_map.erase(key.str());
    }
    convert_shape_to_string(shape, key);
    m_map.insert({key.str(), exec});
    m_list.push_front(shape);
    m_clone_function_map.insert({key.str(), func});
}

bool runtime::LRUCache::is_cached(const vector<int>& shape)
{
    for (auto itr = m_list.begin(); itr != m_list.end(); itr++)
    {
        if (*itr == shape)
        {
            return true;
        }
    }
    return false;
}

shared_ptr<runtime::Executable> runtime::LRUCache::get_cached_entry(const vector<int>& shape)
{
    std::lock_guard<std::mutex> guard(m_mutex);
    ostringstream key;
    convert_shape_to_string(shape, key);

    // find the entry and return the executable
    auto it = m_map.find(key.str());
    if (it == m_map.end())
    {
        throw ngraph_error("Entry not found in cache");
    }
    else
    {
        // update list to push this reference to the front
        for (auto itr = m_list.begin(); itr != m_list.end(); itr++)
        {
            if (*itr == shape)
            {
                m_list.remove(shape);
                m_list.push_front(shape);
                break;
            }
        }
        return it->second;
    }
}

// Need the clone function to get the output shape so that
// storage can be allocated for output
shared_ptr<Function> runtime::LRUCache::get_cloned_function(const vector<int>& shape)
{
    std::lock_guard<std::mutex> guard(m_mutex);
    ostringstream key;
    convert_shape_to_string(shape, key);

    // find the entry and return the function
    auto it = m_clone_function_map.find(key.str());
    if (it == m_clone_function_map.end())
    {
        throw ngraph_error("Cloned function not found");
    }
    return it->second;
}
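A minimal usage sketch of the LRUCache API above. The compiled `exec` and cloned `func` are assumed to already exist; capacity defaults to 1024 entries and can be overridden via the NGRAPH_CACHE_SIZE environment variable read in the constructor.

// Sketch only: assumes `exec` (shared_ptr<runtime::Executable>) and
// `func` (shared_ptr<Function>) are already available.
auto cache = std::make_shared<ngraph::runtime::LRUCache>();
std::vector<int> key{2, 2, 3, 3, -1}; // merged shape key for one input
if (!cache->is_cached(key))
{
    cache->add_entry(key, exec, func); // evicts the LRU entry when full
}
auto cached_exec = cache->get_cached_entry(key); // also refreshes recency
auto cloned = cache->get_cloned_function(key);   // used to size output storage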
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <algorithm>
#include <iostream>
#include <iterator>
#include <list>
#include <mutex>
#include <sstream>
#include <string>
#include <unordered_map>
#include "ngraph/function.hpp"
#include "ngraph/runtime/executable.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
    namespace runtime
    {
        class LRUCache : public std::enable_shared_from_this<LRUCache>
        {
        public:
            using GraphCache = std::unordered_map<std::string, std::shared_ptr<Executable>>;
            using ClonedFunctionMap = std::unordered_map<std::string, std::shared_ptr<Function>>;

            LRUCache();
            virtual ~LRUCache();

            void add_entry(const std::vector<int>& shape,
                           std::shared_ptr<Executable> exec,
                           std::shared_ptr<Function> func);
            bool is_cached(const std::vector<int>& shape);
            std::shared_ptr<Executable> get_cached_entry(const std::vector<int>& shape);
            void convert_shape_to_string(const std::vector<int>& shape, std::ostringstream& key);
            std::shared_ptr<Function> get_cloned_function(const std::vector<int>& shape);

        private:
            int m_cache_size;
            GraphCache m_map;
            ClonedFunctionMap m_clone_function_map;
            std::list<std::vector<int>> m_list;
            std::mutex m_mutex;
        };
    }
}
......@@ -120,130 +120,204 @@ bool runtime::dynamic::DynamicExecutable::call(
    // (1) all shapes;
    // (2) all values of shape-relevant input tensors.
    NGRAPH_CHECK(m_wrapped_function->get_parameters().size() == inputs.size());

    std::vector<int> merged_input_shapes;
    std::ostringstream key;
    size_t loop_count = 0;
    for (auto& input : inputs)
    {
        if (m_wrapped_function->get_parameters()[loop_count]->is_relevant_to_shapes())
        {
            // Caching on values of Shape relevant inputs
            int size = input->get_size_in_bytes() / (input->get_element_type().bitwidth() / 8);
            std::vector<int64_t> data(size);
            input->read(data.data(), input->get_size_in_bytes());
            for (int i = 0; i < input->get_element_count(); i++)
            {
                merged_input_shapes.emplace_back(data[i]);
            }
        }
        else
        {
            // Caching on all remaining shapes
            for (int i = 0; i < input->get_shape().size(); i++)
            {
                merged_input_shapes.emplace_back(input->get_shape()[i]);
            }
        }
        // -1 is the separator.
        // So if shape of Input 1 = {2, 2, 3, 3} & Input 2 = {4, 5}
        // the key would be 2, 2, 3, 3, -1, 4, 5, -1
        merged_input_shapes.emplace_back(-1);
        loop_count++;
    }

    std::copy(merged_input_shapes.begin(),
              merged_input_shapes.end(),
              std::ostream_iterator<int>(key, ", "));

    if (m_lru->is_cached(merged_input_shapes))
    {
        std::vector<std::shared_ptr<runtime::Tensor>> wrapped_inputs;
        std::vector<std::shared_ptr<runtime::Tensor>> wrapped_outputs;

        std::shared_ptr<Function> clone = m_lru->get_cloned_function(merged_input_shapes);
        const ResultVector& results = clone->get_results();
        for (auto& result : results)
        {
            NGRAPH_CHECK(result->get_output_partial_shape(0).is_static(),
                         "Shape staticization failed for result node ",
                         *result);
        }
        NGRAPH_CHECK(results.size() == outputs.size());

        for (size_t i = 0; i < outputs.size(); i++)
        {
            if (auto dynamic_tensor =
                    std::dynamic_pointer_cast<runtime::dynamic::DynamicTensor>(outputs[i]))
            {
                dynamic_tensor->make_storage(results[i]->get_output_element_type(0),
                                             results[i]->get_output_shape(0));
                wrapped_outputs.push_back(dynamic_tensor->get_wrapped_tensor());
            }
            else
            {
                wrapped_outputs.push_back(outputs[i]);
            }
        }

        return m_lru->get_cached_entry(merged_input_shapes)->call(wrapped_outputs, inputs);
    }
    else
    {
        NGRAPH_CHECK(m_wrapped_function->get_parameters().size() == inputs.size());

        std::vector<std::shared_ptr<runtime::Tensor>> wrapped_inputs;
        std::vector<element::Type> arg_element_types;
        std::vector<PartialShape> arg_shapes;

        std::shared_ptr<Function> clone;
        {
            // We'll use AlignedBuffers to back the base pointers, storing them in this vector
            // for RAII purposes.
            std::vector<AlignedBuffer> arg_buffers;
            arg_buffers.reserve(inputs.size());
            std::vector<void*> arg_value_base_pointers(inputs.size());

            size_t i = 0;
            for (auto& input : inputs)
            {
                if (m_wrapped_function->get_parameters()[i]->is_relevant_to_shapes())
                {
                    // TODO(amprocte): Move has_storage() to runtime::Tensor?
                    if (auto dynamic_tensor =
                            std::dynamic_pointer_cast<runtime::dynamic::DynamicTensor>(input))
                    {
                        NGRAPH_CHECK(dynamic_tensor->has_storage());
                    }
                    arg_buffers.emplace_back(input->get_size_in_bytes(), /*alignment=*/64);
                    arg_value_base_pointers[i] = arg_buffers.back().get_ptr();
                    // TODO(amprocte): For host-resident tensors we should be able to skip the
                    // read, but no API for that yet.
                    input->read(arg_value_base_pointers[i], input->get_size_in_bytes());
                }
                else
                {
                    arg_value_base_pointers[i] = nullptr;
                }
                if (auto dynamic_tensor =
                        std::dynamic_pointer_cast<runtime::dynamic::DynamicTensor>(input))
                {
                    NGRAPH_CHECK(dynamic_tensor->has_storage());
                    arg_element_types.push_back(
                        dynamic_tensor->get_wrapped_tensor()->get_element_type());
                    arg_shapes.push_back(dynamic_tensor->get_wrapped_tensor()->get_shape());
                    wrapped_inputs.push_back(dynamic_tensor->get_wrapped_tensor());
                }
                else
                {
                    arg_element_types.push_back(input->get_element_type());
                    arg_shapes.push_back(input->get_shape());
                    wrapped_inputs.push_back(input);
                }
                i++;
            }

            clone = specialize_function(
                m_wrapped_function, arg_element_types, arg_shapes, arg_value_base_pointers);
        }

        pass::Manager passes;
        passes.register_pass<pass::ConstantFolding>();
        passes.register_pass<pass::DynElimination>();
        passes.register_pass<pass::Opset0Downgrade>(); // Converts dynamic v1 variants to v0 ops
        passes.set_per_pass_validation(false);

        // FIXME(amprocte): Vile, temporary hack: we need to do repeated rounds of
        // ConstantFolding/DynElimination until everything that DynElimination is supposed to
        // eliminate has actually been eliminated. We could do this by monitoring the return
        // values of the passes (keep iterating until both CF and DE report no changes), but
        // that did not seem to work so here we are. Probably a better fix is to somehow
        // combine the matchers in CF and DE into one pass.
        size_t num_dyn_nodes_last_pass = std::numeric_limits<size_t>::max();
        while (num_dyn_nodes_last_pass != 0)
        {
            passes.run_passes(clone);
            auto num_dyn_nodes_this_pass = count_dyn_nodes(clone);

            NGRAPH_CHECK(num_dyn_nodes_this_pass < num_dyn_nodes_last_pass,
                         "Could not eliminate all Dyn nodes (",
                         num_dyn_nodes_this_pass,
                         " remaining)");

            num_dyn_nodes_last_pass = num_dyn_nodes_this_pass;
        }

        pass::Manager pass_val;
        pass_val.register_pass<pass::Validate>();
        pass_val.run_passes(clone);

        std::vector<std::shared_ptr<runtime::Tensor>> wrapped_outputs;

        const ResultVector& results = clone->get_results();
        for (auto& result : results)
        {
            NGRAPH_CHECK(result->get_output_partial_shape(0).is_static(),
                         "Shape staticization failed for result node ",
                         *result);
        }
        NGRAPH_CHECK(results.size() == outputs.size());

        for (size_t i = 0; i < outputs.size(); i++)
        {
            if (auto dynamic_tensor =
                    std::dynamic_pointer_cast<runtime::dynamic::DynamicTensor>(outputs[i]))
            {
                dynamic_tensor->make_storage(results[i]->get_output_element_type(0),
                                             results[i]->get_output_shape(0));
                wrapped_outputs.push_back(dynamic_tensor->get_wrapped_tensor());
            }
            else
            {
                wrapped_outputs.push_back(outputs[i]);
            }
        }

        auto compiled_executable =
            m_wrapped_backend->compile(clone, m_enable_performance_collection);
        // Put compiled executable in the cache.
        m_lru->add_entry(merged_input_shapes, compiled_executable, clone);
        auto result = compiled_executable->call(wrapped_outputs, wrapped_inputs);

        return result;
    }
}
runtime::dynamic::DynamicTensor::DynamicTensor(
......
......@@ -22,6 +22,7 @@
#include <vector>
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/cache.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
......@@ -100,6 +101,8 @@ public:
private:
    std::shared_ptr<ngraph::Function> m_wrapped_function;
    std::shared_ptr<ngraph::runtime::Backend> m_wrapped_backend;
    std::shared_ptr<ngraph::runtime::LRUCache> m_lru =
        std::make_shared<ngraph::runtime::LRUCache>();
    bool m_enable_performance_collection;
};
......
......@@ -1408,6 +1408,8 @@ NGRAPH_TEST(${BACKEND_NAME}, avg_pool_bprop_2d_2channel_2image_dyn_shape)
    float denom = 2 * 2;

    ex->call_with_validate({t_r}, {deltas, forward_shape});
    ex->call_with_validate({t_r}, {deltas, forward_shape});
    ex->call_with_validate({t_r}, {deltas, forward_shape});
    ASSERT_EQ(t_r->get_shape(), (Shape{2, 2, 3, 3}));
......
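The repeated calls in the test above exercise the new cache: the first call compiles the specialized function and adds it via add_entry, and subsequent calls with identical shapes take the cache-hit path. A hedged sketch of that pattern from the caller's side (backend name, dynamic-support flag, and the function `f` are assumptions for illustration):

// Sketch: repeated calls with identical input shapes should hit the LRU cache.
auto backend = runtime::Backend::create("INTERPRETER", /*must_support_dynamic=*/true);
auto ex = backend->compile(f); // f: a Function with dynamic input shapes
ex->call_with_validate({t_r}, {deltas, forward_shape}); // compile + add_entry
ex->call_with_validate({t_r}, {deltas, forward_shape}); // is_cached -> cached call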