Commit f277e1c2 authored by Jaikrishnan Menon's avatar Jaikrishnan Menon Committed by Scott Cyphers

DEX Micro-optimizations: Use the mapped-value reference capture idiom everywhere (#1352)

parent 4efcb76e
......@@ -1129,7 +1129,9 @@ void runtime::cpu::CPU_ExternalFunction::build()
for (size_t i = 0; i < param->get_output_size(); ++i)
{
shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
function_input_index[tv->get_tensor().get_name()] = arg_index;
function_input_index.emplace_back(tensor_data[tv->get_tensor().get_name()],
arg_index,
tensor_stale[tv->get_tensor().get_name()]);
arg_index++;
}
}
......@@ -1139,14 +1141,14 @@ void runtime::cpu::CPU_ExternalFunction::build()
{
shared_ptr<Node> op = m_function->get_output_op(i);
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
function_output_index[tv->get_tensor().get_name()] = i;
function_output_index.emplace_back(tensor_data[tv->get_tensor().get_name()], i);
auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op);
if (!res->needs_copy())
{
shared_ptr<descriptor::TensorView> itv =
res->get_inputs().at(0).get_output().get_tensor_view();
function_output_index[itv->get_tensor().get_name()] = i;
function_output_index.emplace_back(tensor_data[itv->get_tensor().get_name()], i);
}
}
......@@ -1159,7 +1161,8 @@ void runtime::cpu::CPU_ExternalFunction::build()
{
for (auto tensor : node->liveness_new_list)
{
intermediates_offsets[tensor->get_name()] = tensor->get_pool_offset();
intermediates_offsets.emplace_back(tensor_data[tensor->get_name()],
tensor->get_pool_offset());
}
}
}
......@@ -1212,21 +1215,47 @@ void runtime::cpu::CPU_ExternalFunction::build()
handler->second(this, node.get(), in, out);
bool disable_caching = computes_result(node.get()) || possibly_overwritten(node.get());
auto enable = [&, in_names, out_names, disable_caching](CPURuntimeContext* ctx) -> bool {
bool en = false;
vector<reference_wrapper<bool>> in_stale, out_stale;
for (const auto& name : in_names)
{
if (tensor_stale[name] || disable_caching)
in_stale.emplace_back(tensor_stale[name]);
}
for (const auto& name : out_names)
{
out_stale.emplace_back(tensor_stale[name]);
}
function<bool(CPURuntimeContext*)> enable;
if (disable_caching)
{
enable = [in_stale, out_stale](CPURuntimeContext* ctx) -> bool {
for (auto& stale : out_stale)
{
stale.get() = true;
}
return true;
};
}
else
{
enable = [in_stale, out_stale](CPURuntimeContext* ctx) -> bool {
bool en = false;
for (const auto& stale : in_stale)
{
if (stale)
{
en = true;
break;
}
}
for (const auto& name : out_names)
for (auto& stale : out_stale)
{
tensor_stale[name] = en;
stale.get() = en;
}
return en;
};
}
enables.emplace_back(make_pair(enable, functors.size() - functor_count));
enable_nodename_list.emplace_back(make_pair(enable, node->get_name()));
......@@ -1240,20 +1269,19 @@ void runtime::cpu::CPU_ExternalFunction::build()
{
for (auto& p : intermediates_offsets)
{
tensor_data[p.first] =
static_cast<uint8_t*>(ctx->memory_buffers[0]->get_ptr()) + p.second;
p.first.get() = static_cast<uint8_t*>(ctx->memory_buffers[0]->get_ptr()) + p.second;
}
}
for (const auto& p : function_input_index)
{
tensor_data[p.first] = inputs[p.second];
tensor_stale[p.first] = ctx->p_en[p.second];
get<0>(p).get() = inputs[get<1>(p)];
get<2>(p).get() = ctx->p_en[get<1>(p)];
}
for (const auto& p : function_output_index)
{
tensor_data[p.first] = outputs[p.second];
p.first.get() = outputs[p.second];
}
auto functor = functors.begin();
......
......@@ -24,6 +24,7 @@
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include <utility>
#include <vector>
#include "ngraph/codegen/code_writer.hpp"
......@@ -178,8 +179,11 @@ namespace ngraph
executor;
std::unordered_map<std::string, void*> tensor_data;
std::unordered_map<std::string, bool> tensor_stale;
std::unordered_map<std::string, size_t> intermediates_offsets;
std::unordered_map<std::string, size_t> function_input_index, function_output_index;
std::list<std::pair<std::reference_wrapper<void*>, size_t>> intermediates_offsets;
std::list<
std::tuple<std::reference_wrapper<void*>, size_t, std::reference_wrapper<bool>>>
function_input_index;
std::list<std::pair<std::reference_wrapper<void*>, size_t>> function_output_index;
std::unordered_map<std::string, std::shared_ptr<CPU_ExternalFunction>> callees;
bool m_is_built;
bool m_direct_execution;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment