Commit 41c50b44 authored by Jayaram Bobba, committed by Scott Cyphers

Computation reuse (#945)

* Make temp memory pools static to avoid memory allocation overheads

* Initial implementation for graph control to enable caching and computation reuse

* Added sphinx documentation

* Turned off memory buffer reuse in CPU transformer to support computation reuse. Added unit test

* Change memoizable to cacheable

* Change memoizable to cacheable

* Rename variables
parent 698f29f1
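
The pieces introduced here fit together roughly as follows: a Parameter constructed with cacheable = true declares that its data is rarely updated, the caller marks the corresponding tensor view non-stale before subsequent calls, and the CPU backend then skips recomputing any op whose inputs are all non-stale. Below is a minimal usage sketch assembled from the interfaces in this diff; the shapes, values, includes, and the choice of the "CPU" backend are illustrative, not part of the commit.

// Hypothetical usage sketch of the cacheable/stale interfaces added by this commit.
// Includes and setup are approximate; only Parameter(..., cacheable) and set_stale()
// come directly from the diff below.
#include <memory>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "ngraph/runtime/backend.hpp"

using namespace ngraph;

int main()
{
    Shape shape{2, 2};
    auto X = std::make_shared<op::Parameter>(element::f32, shape);
    // W is not expected to change between calls, so mark it cacheable.
    auto W = std::make_shared<op::Parameter>(element::f32, shape, /*cacheable=*/true);
    auto f = std::make_shared<Function>(std::make_shared<op::Add>(X, W),
                                        op::ParameterVector{X, W});

    auto backend = runtime::Backend::create("CPU");
    auto x = backend->create_tensor(element::f32, shape);
    auto w = backend->create_tensor(element::f32, shape);
    auto r = backend->create_tensor(element::f32, shape);

    std::vector<float> ones(shape_size(shape), 1.0f);
    x->write(ones.data(), 0, ones.size() * sizeof(float));
    w->write(ones.data(), 0, ones.size() * sizeof(float));

    backend->call(f, {r}, {x, w}); // first call: everything is computed
    w->set_stale(false);           // w's data has not changed since the last call
    backend->call(f, {r}, {x, w}); // ops fed only by non-stale inputs reuse cached results
    return 0;
}
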
......@@ -25,6 +25,9 @@ Attributes
+------------------+------------------------------------------+
| ``shape`` | The ``Shape`` of the parameter. |
+------------------+------------------------------------------+
| ``cacheable`` | True if the parameter is not expected to |
| | be frequently updated. |
+------------------+------------------------------------------+
Outputs
-------
......
......@@ -21,8 +21,11 @@
using namespace std;
using namespace ngraph;
-op::Parameter::Parameter(const element::Type& element_type, const Shape& shape)
+op::Parameter::Parameter(const element::Type& element_type,
+                         const Shape& shape,
+                         const bool cacheable)
: Op("Parameter", {})
, m_cacheable(cacheable)
{
add_output(element_type, shape);
}
......
......@@ -40,10 +40,17 @@ namespace ngraph
///
/// \param element_type The element type of the parameter.
/// \param shape The shape of the parameter.
-Parameter(const ngraph::element::Type& element_type, const Shape& shape);
+/// \param cacheable True if the parameter is not expected to be frequently updated.
+Parameter(const ngraph::element::Type& element_type,
+          const Shape& shape,
+          const bool cacheable = false);
const bool get_cacheable() const { return m_cacheable; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
protected:
bool m_cacheable;
};
}
}
......@@ -27,8 +27,9 @@
using namespace std;
using namespace ngraph;
-pass::MemoryLayout::MemoryLayout(size_t alignment)
+pass::MemoryLayout::MemoryLayout(size_t alignment, bool disable_memory_sharing)
: m_alignment(alignment)
, m_disable_memory_sharing(disable_memory_sharing)
{
}
......@@ -42,9 +43,12 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
size_t offset = mm.allocate(tensor->size());
tensor->set_pool_offset(offset);
}
-for (const descriptor::Tensor* tensor : node->liveness_free_list)
+if (!m_disable_memory_sharing)
{
-    mm.free(tensor->get_pool_offset());
+    for (const descriptor::Tensor* tensor : node->liveness_free_list)
+    {
+        mm.free(tensor->get_pool_offset());
+    }
}
}
function->set_temporary_pool_size(mm.max_allocated());
......
......@@ -35,11 +35,12 @@ namespace ngraph
class ngraph::pass::MemoryLayout : public FunctionPass
{
public:
-MemoryLayout(size_t alignment = 1);
+MemoryLayout(size_t alignment = 1, bool disable_memory_sharing = false);
bool run_on_function(std::shared_ptr<ngraph::Function>) override;
private:
size_t m_alignment;
bool m_disable_memory_sharing;
};
class ngraph::pass::MemoryManager
......
......@@ -51,6 +51,7 @@ void runtime::cpu::CPU_CallFrame::call(
{
shared_ptr<runtime::cpu::CPUTensorView> tv =
static_pointer_cast<runtime::cpu::CPUTensorView>(input_tvs[i]);
ctx->p_en[i] = tv->get_stale();
inputs.push_back(tv->get_data_ptr());
}
for (size_t i = 0; i < output_tvs.size(); i++)
......@@ -100,6 +101,7 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
{
ctx->op_durations = new int64_t[m_external_function->get_op_attrs().size()];
}
ctx->p_en = new bool[m_external_function->get_parameter_layout_descriptors().size()];
const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter();
ctx->mkldnn_primitives = mkldnn_emitter->get_mkldnn_primitives().data();
ctx->mkldnn_workspaces = mkldnn_emitter->get_mkldnn_workspaces().data();
......@@ -108,5 +110,6 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
void runtime::cpu::CPU_CallFrame::cleanup_runtime_context()
{
delete[] ctx->op_durations;
delete[] ctx->p_en;
delete ctx;
}
......@@ -330,7 +330,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>();
-pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
+pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment, true);
pass_manager.run_passes(m_function);
unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
......@@ -588,6 +588,26 @@ using namespace ngraph::runtime;
<< ");\n";
}
// Indexing for Control Flags
std::map<std::string, size_t> tensor_index_map;
std::map<std::string, size_t> param_index_map;
size_t tensor_index = 0;
for (shared_ptr<Node> node : ordered_ops)
{
if (!node->is_parameter() && !node->is_constant())
{
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
tensor_index_map.insert({tv->get_tensor().get_name(), tensor_index++});
}
}
}
writer << "bool " << current_function->get_name() << "_t_en[" << tensor_index << "];\n";
writer << "bool " << current_function->get_name() << "_init = true;\n";
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, cpu::CPURuntimeContext* ctx)\n";
writer << "{\n";
......@@ -625,6 +645,8 @@ using namespace ngraph::runtime;
}
}
writer << "bool* t_en = (bool*)" << current_function->get_name() << "_t_en;\n";
// Add inputs to the variable name map
size_t arg_index = 0;
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
......@@ -637,6 +659,7 @@ using namespace ngraph::runtime;
stringstream ss;
ss << "((" << type << "*)(inputs[" << arg_index << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
param_index_map[tv->get_tensor().get_name()] = arg_index;
arg_index++;
}
}
......@@ -748,6 +771,30 @@ using namespace ngraph::runtime;
{
emit_debug_function_entry(writer, node.get(), in, out);
}
// Op Control
if (!node->is_parameter() && !node->is_constant())
{
writer << "if (" << current_function->get_name() << "_init ";
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
auto input_name = tv->get_tensor().get_name();
if (output.get_node()->is_parameter())
{
writer << " || ctx->p_en[" << param_index_map[input_name] << "]";
}
else if (!output.get_node()->is_constant())
{
writer << " || t_en[" << tensor_index_map[input_name] << "]";
}
}
writer << ") {\n";
writer.indent++;
}
string func_name;
auto it = match_functions.find(node.get());
if (it == match_functions.end())
......@@ -794,6 +841,19 @@ using namespace ngraph::runtime;
// Emit operation epilogue
if (!node->is_parameter() && !node->is_constant())
{
for (auto output_name : node_output_names)
{
writer << "t_en[" << tensor_index_map[output_name] << "] = true;\n";
}
writer.indent--;
writer << "} else {\n";
writer.indent++;
for (auto output_name : node_output_names)
{
writer << "t_en[" << tensor_index_map[output_name] << "] = false;\n";
}
writer.indent--;
writer << "}\n";
emit_debug_function_exit(writer, node.get(), in, out);
if (runtime::cpu::IsTracingEnabled() &&
current_function->get_name() == m_function_name)
......@@ -850,6 +910,7 @@ using namespace ngraph::runtime;
writer << "try { G.wait_for_all(); } catch(...) { throw; }\n";
}
}
writer << current_function->get_name() << "_init = false;\n";
writer.indent--;
// End generated function
......
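
For orientation, here is a hand-written approximation of the control-flow skeleton the emitter above generates around a single non-parameter op. The function name, tensor-enable indices, and kernel call are made up; the flag names (X_init, X_t_en, ctx->p_en, t_en) mirror the strings emitted in the diff.

// Hypothetical sketch of the generated guard for one op (not actual emitter output).
namespace cpu { struct CPURuntimeContext { bool* p_en; }; } // stand-in for the real runtime struct

bool func_0_t_en[4];     // per-tensor "enabled" (freshly computed) flags
bool func_0_init = true; // force full computation on the first call

extern "C" void func_0(void** inputs, void** outputs, cpu::CPURuntimeContext* ctx)
{
    bool* t_en = (bool*)func_0_t_en;
    // ... cast inputs/outputs to typed pointers ...
    // Recompute this op only on the first call or when some input changed:
    // parameter inputs are checked via ctx->p_en, intermediate tensors via t_en.
    if (func_0_init || ctx->p_en[0] || t_en[2])
    {
        // ... kernel / function call for the op goes here ...
        t_en[3] = true;  // the op's output is fresh; dependents must recompute
    }
    else
    {
        t_en[3] = false; // output unchanged; dependents may reuse cached results
    }
    func_0_init = false; // later calls rely solely on the staleness flags
}
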
......@@ -38,6 +38,7 @@ namespace ngraph
struct CPURuntimeContext
{
int64_t* op_durations;
bool* p_en;
mkldnn::primitive* const* mkldnn_primitives;
char* const* mkldnn_workspaces;
};
......
......@@ -38,6 +38,7 @@ namespace ngraph
protected:
TensorView(const std::shared_ptr<ngraph::descriptor::TensorView>& descriptor)
: m_descriptor(descriptor)
, m_stale(true)
{
}
......@@ -58,6 +59,8 @@ namespace ngraph
std::shared_ptr<ngraph::descriptor::layout::TensorViewLayout>
get_tensor_view_layout() const;
bool get_stale() { return m_stale; }
void set_stale(bool val) { m_stale = val; }
/// @brief Write bytes directly into the tensor
/// @param p Pointer to source of data
/// @param tensor_offset Offset into tensor storage to begin writing. Must be element-aligned.
......@@ -72,6 +75,7 @@ namespace ngraph
protected:
std::shared_ptr<ngraph::descriptor::TensorView> m_descriptor;
bool m_stale;
};
using TensorViewPtrs = std::vector<std::shared_ptr<TensorView>>;
......
......@@ -739,7 +739,8 @@ static shared_ptr<ngraph::Function>
node_js.count("element_type") == 0 ? node_js.at("value_type") : node_js;
auto element_type = read_element_type(type_node_js.at("element_type"));
auto shape = type_node_js.at("shape");
-node = make_shared<op::Parameter>(element_type, shape);
+auto cacheable = get_or_default<bool>(node_js, "cacheable", false);
+node = make_shared<op::Parameter>(element_type, shape, cacheable);
}
else if (node_op == "Power")
{
......
......@@ -5837,6 +5837,48 @@ TEST(${BACKEND_NAME}, mkldnn_layouts)
EXPECT_EQ(vector<float>{expected_result}, rv);
}
TEST(${BACKEND_NAME}, computation_reuse)
{
ONLY_ENABLE_TEST_FOR("CPU", "${BACKEND_NAME}");
Shape shape_a{1, 16, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{32, 16, 1, 1};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{1, 32, 2, 2};
auto conv = make_shared<op::Convolution>(A,
B,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
Shape pool_shape{1, 1};
auto pool = make_shared<op::AvgPool>(conv, pool_shape);
auto bias = make_shared<op::Broadcast>(
op::Constant::create(element::f32, Shape{}, {2.14}), shape_r, AxisSet{0, 1, 2, 3});
auto result_op = make_shared<op::Result>(pool + bias);
auto f = make_shared<Function>(ResultVector{result_op}, op::ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
vector<float> input(64, 1.0f);
vector<float> weights(512, 0.5f);
vector<float> rv(128);
auto a = backend->create_tensor(element::f32, shape_a, input.data());
auto b = backend->create_tensor(element::f32, shape_b, weights.data());
auto result = backend->create_tensor(element::f32, shape_r, rv.data());
backend->call(f, {result}, {a, b});
vector<float> rv_saved(rv);
b->set_stale(false);
backend->call(f, {result}, {a, b});
EXPECT_EQ(rv_saved, rv);
}
TEST(${BACKEND_NAME}, avg_pool_1d_1channel_1image)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
......
......@@ -205,11 +205,13 @@ void run_benchmark(shared_ptr<Function> f,
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::TensorView>> args;
vector<bool> args_cacheable;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape());
random_init(tensor);
args.push_back(tensor);
args_cacheable.push_back(param->get_cacheable());
}
vector<shared_ptr<runtime::TensorView>> results;
for (shared_ptr<Node> out : f->get_results())
......@@ -218,6 +220,13 @@ void run_benchmark(shared_ptr<Function> f,
results.push_back(result);
}
for (size_t i = 0; i < args.size(); i++)
{
if (args_cacheable[i])
{
args[i]->set_stale(false);
}
}
stopwatch t1;
t1.start();
for (size_t i = 0; i < static_cast<size_t>(iterations); i++)
......