Commit 78b0a365, authored by Rob Earhart and committed by Scott Cyphers

Minor perf tweaks (#2095)

* Only synchronize stale input tensors

* Cache backend.call() compilations

* Cache invoker bindings

* Update test list
parent 1e5e7145
...@@ -52,11 +52,7 @@ bool ngraph::runtime::plaidml::PlaidML_Backend::call( ...@@ -52,11 +52,7 @@ bool ngraph::runtime::plaidml::PlaidML_Backend::call(
const std::vector<std::shared_ptr<runtime::Tensor>>& outputs, const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) const std::vector<std::shared_ptr<runtime::Tensor>>& inputs)
{ {
auto cfunc = m_cache.try_lookup(func); auto cfunc = m_cache.compile(func, &m_compiler);
if (!cfunc)
{
cfunc = m_compiler.compile(func);
}
cfunc->schedule_invocation(inputs, outputs); cfunc->schedule_invocation(inputs, outputs);
return true; return true;
} }
......
...@@ -42,19 +42,31 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation( ...@@ -42,19 +42,31 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
NGRAPH_DEBUG << "Binding PlaidML function " << this; NGRAPH_DEBUG << "Binding PlaidML function " << this;
m_bound_inputs.resize(inputs.size());
m_bound_outputs.resize(outputs.size());
std::size_t input_count = 0; std::size_t input_count = 0;
for (const auto& param : m_func->get_parameters()) for (const auto& param : m_func->get_parameters())
{ {
for (std::size_t idx = 0; idx < param->get_output_size(); ++idx) for (std::size_t idx = 0; idx < param->get_output_size(); ++idx)
{ {
descriptor::Tensor* tv = param->get_output_tensor_ptr(idx).get(); descriptor::Tensor* tv = param->get_output_tensor_ptr(idx).get();
auto rtv = dynamic_cast<PlaidML_Tensor*>(inputs[input_count++].get()); auto& input = inputs.at(input_count);
auto rtv = dynamic_cast<PlaidML_Tensor*>(input.get());
if (!rtv) if (!rtv)
{ {
throw std::runtime_error{ throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"}; "The PlaidML backend only operates on PlaidML tensor views"};
} }
rtv->sync_input(); rtv->sync_input();
auto& bound_input = m_bound_inputs.at(input_count);
++input_count;
if (bound_input.lock() == input)
{
// No need to re-bind this input.
continue;
}
bound_input = input;
NGRAPH_DEBUG << "Binding input " << m_input_names.at(tv) << " to tensor " << rtv; NGRAPH_DEBUG << "Binding input " << m_input_names.at(tv) << " to tensor " << rtv;
m_invoker.set_input(m_input_names.at(tv), rtv->tensor()); m_invoker.set_input(m_input_names.at(tv), rtv->tensor());
} }
...@@ -66,12 +78,21 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation( ...@@ -66,12 +78,21 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
for (std::size_t idx = 0; idx < result->get_output_size(); ++idx) for (std::size_t idx = 0; idx < result->get_output_size(); ++idx)
{ {
descriptor::Tensor* tv = result->get_output_tensor_ptr(idx).get(); descriptor::Tensor* tv = result->get_output_tensor_ptr(idx).get();
auto rtv = dynamic_cast<PlaidML_Tensor*>(outputs[output_count++].get()); auto& output = outputs.at(output_count);
auto rtv = dynamic_cast<PlaidML_Tensor*>(output.get());
if (!rtv) if (!rtv)
{ {
throw std::runtime_error{ throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"}; "The PlaidML backend only operates on PlaidML tensor views"};
}
auto& bound_output = m_bound_outputs.at(output_count);
++output_count;
if (bound_output.lock() == output)
{
// No need to re-bind this output.
continue;
} }
bound_output = output;
NGRAPH_DEBUG << "Binding output " << m_output_names.at(tv) << " to tensor " << rtv; NGRAPH_DEBUG << "Binding output " << m_output_names.at(tv) << " to tensor " << rtv;
m_invoker.set_output(m_output_names.at(tv), rtv->tensor()); m_invoker.set_output(m_output_names.at(tv), rtv->tensor());
} }
...@@ -91,7 +112,7 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation( ...@@ -91,7 +112,7 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
if (!rtv) if (!rtv)
{ {
throw std::runtime_error{ throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"}; "The PlaidML backend only operates on PlaidML tensor views"};
} }
rtv->sync_output(); rtv->sync_output();
} }
......
...@@ -58,5 +58,7 @@ private: ...@@ -58,5 +58,7 @@ private:
std::shared_ptr<Function> m_func; std::shared_ptr<Function> m_func;
std::unordered_map<descriptor::Tensor*, std::string> m_input_names; std::unordered_map<descriptor::Tensor*, std::string> m_input_names;
std::unordered_map<descriptor::Tensor*, std::string> m_output_names; std::unordered_map<descriptor::Tensor*, std::string> m_output_names;
mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_inputs;
mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_outputs;
mutable vertexai::plaidml::invoker m_invoker; mutable vertexai::plaidml::invoker m_invoker;
}; };
...@@ -101,6 +101,11 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::read(void* p, size_t tensor_offse ...@@ -101,6 +101,11 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::read(void* p, size_t tensor_offse
void ngraph::runtime::plaidml::PlaidML_Tensor::sync_input() void ngraph::runtime::plaidml::PlaidML_Tensor::sync_input()
{ {
if (!get_stale())
{
return;
}
set_stale(false);
if (!m_memory) if (!m_memory)
{ {
if (m_is_logically_zero) if (m_is_logically_zero)
...@@ -122,6 +127,7 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::sync_output() ...@@ -122,6 +127,7 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::sync_output()
{ {
// The tensor's been used for an output, so it's no longer logically zero. // The tensor's been used for an output, so it's no longer logically zero.
m_is_logically_zero = false; m_is_logically_zero = false;
set_stale(false);
if (!m_memory) if (!m_memory)
{ {
......
...@@ -26,12 +26,15 @@ topk_1d_max_one # No plans to implement TopK ...@@ -26,12 +26,15 @@ topk_1d_max_one # No plans to implement TopK
topk_1d_min_all # No plans to implement TopK topk_1d_min_all # No plans to implement TopK
topk_1d_min_partial # No plans to implement TopK topk_1d_min_partial # No plans to implement TopK
topk_1d_min_one # No plans to implement TopK topk_1d_min_one # No plans to implement TopK
topk_3d_large_input_max # No plans to implement TopK
topk_3d_large_input_min # No plans to implement TopK
topk_3d_max_all # No plans to implement TopK topk_3d_max_all # No plans to implement TopK
topk_3d_max_partial # No plans to implement TopK topk_3d_max_partial # No plans to implement TopK
topk_3d_max_one # No plans to implement TopK topk_3d_max_one # No plans to implement TopK
topk_3d_min_all # No plans to implement TopK topk_3d_min_all # No plans to implement TopK
topk_3d_min_partial # No plans to implement TopK topk_3d_min_partial # No plans to implement TopK
topk_3d_min_one # No plans to implement TopK topk_3d_min_one # No plans to implement TopK
topk_3d_single_output # No plans to implement TopK
topk_2d_max_all # No plans to implement TopK topk_2d_max_all # No plans to implement TopK
topk_2d_max_partial # No plans to implement TopK topk_2d_max_partial # No plans to implement TopK
topk_2d_max_one # No plans to implement TopK topk_2d_max_one # No plans to implement TopK
...@@ -43,15 +46,21 @@ topk_5d_max_partial # No plans to implement TopK ...@@ -43,15 +46,21 @@ topk_5d_max_partial # No plans to implement TopK
# Tests that PlaidML might be able to run at some point. # Tests that PlaidML might be able to run at some point.
backwards_maxpool_n2_c1_hw5_3x3_str2_max_pad1x2_2x3 backwards_maxpool_n2_c1_hw5_3x3_str2_max_pad1x2_2x3
backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1
backwards_maxpool_n2c1h5w5_kh3kw3_sh2sw2
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_slice backwards_slice
batchnorm_fprop_bprop # To debug batchnorm_fprop_bprop # To debug
batchnorm_fprop_bprop_2step # To debug batchnorm_fprop_bprop_2step # To debug
softmax_axis_3d_double # To debug
reduce_matrix_rows_zero # To debug: possible broadcasting error? reduce_matrix_rows_zero # To debug: possible broadcasting error?
reduce_matrix_cols_zero # To debug: possible broadcasting error? reduce_matrix_cols_zero # To debug: possible broadcasting error?
reduce_3d_to_vector # To debug: possible broadcasting error? reduce_3d_to_vector # To debug: possible broadcasting error?
replace_slice_matrix_inplace replace_slice_matrix_inplace
max_pool_2d_1channel_1image_overpadded max_pool_2d_1channel_1image_overpadded
max_pool_3d max_pool_3d
maxpool_bprop_larger_than_cache
reduce_window_emulating_max_pool_1d_1channel_1image reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image reduce_window_emulating_max_pool_1d_2channel_2image
...@@ -60,31 +69,49 @@ reduce_window_emulating_max_pool_2d_1channel_1image_strided ...@@ -60,31 +69,49 @@ reduce_window_emulating_max_pool_2d_1channel_1image_strided
select_and_scatter_with_overlap select_and_scatter_with_overlap
select_and_scatter_without_overlap select_and_scatter_without_overlap
select_and_scatter_3d_without_overlap select_and_scatter_3d_without_overlap
generate_mask
avg_pool_3d avg_pool_3d
avg_pool_3d_uneven_strided_padded_include_in_computation avg_pool_3d_uneven_strided_padded_include_in_computation
dequantize_zero_offset # Quantization/Dequantization is unimplemented dequantize_int8_zero_offset # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_ZERO # Quantization/Dequantization is unimplemented dequantize_int32 # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_UPWARD # Quantization/Dequantization is unimplemented dequantize_int32_zero_offset # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_DOWNWARD # Quantization/Dequantization is unimplemented dequantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_EVEN # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_INFINITY # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_UPWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_ZERO # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_DOWNWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_UP # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_TOWARD_EVEN # Quantization/Dequantization is unimplemented
quantize_ROUND_DOWN # Quantization/Dequantization is unimplemented quantize_ROUND_NEAREST_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize # Quantization/Dequantization is unimplemented quantize_ROUND_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_axes # Quantization/Dequantization is unimplemented quantize_ROUND_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_int8 # Quantization/Dequantization is unimplemented quantize_ROUND_UP # Quantization/Dequantization is unimplemented
quantize_clamp # Quantization/Dequantization is unimplemented quantize_ROUND_DOWN # Quantization/Dequantization is unimplemented
dequantize # Quantization/Dequantization is unimplemented quantize # Quantization/Dequantization is unimplemented
dequantize_axes # Quantization/Dequantization is unimplemented quantize_zero_offset # Quantization/Dequantization is unimplemented
dequantize_int8 # Quantization/Dequantization is unimplemented quantize_axes # Quantization/Dequantization is unimplemented
sum_matrix_rows_zero # Empty dims apparently should produce shaped 0s quantize_int8 # Quantization/Dequantization is unimplemented
sum_matrix_cols_zero # Empty dims apparently should produce shaped 0s quantize_int8_zero_offset # Quantization/Dequantization is unimplemented
sum_vector_zero # Empty dims apparently should produce shaped 0s quantize_int32 # Quantization/Dequantization is unimplemented
sum_matrix_to_scalar_zero_by_zero # Empty dims apparently should produce shaped 0s quantize_int32_zero_offset # Quantization/Dequantization is unimplemented
sum_3d_eliminate_zero_dim # Empty dims apparently should produce shaped 0s quantize_clamp # Quantization/Dequantization is unimplemented
dot_0_0 # Empty dims apparently should produce shaped 0s quantize_clamp_int8 # Quantization/Dequantization is unimplemented
dot_matrix_2x0_0x2 # Empty dims apparently should produce shaped 0s quantize_clamp_int32 # Quantization/Dequantization is unimplemented
dot_2x0_0 # Empty dims apparently should produce shaped 0s quantize_clamp_int32_zero_offset # Quantization/Dequantization is unimplemented
quantize_clamp_uint8 # Quantization/Dequantization is unimplemented
dequantize # Quantization/Dequantization is unimplemented
dequantize_axes # Quantization/Dequantization is unimplemented
dequantize_int8 # Quantization/Dequantization is unimplemented
sum_matrix_rows_zero # Empty dims apparently should produce shaped 0s
sum_matrix_cols_zero # Empty dims apparently should produce shaped 0s
sum_vector_zero # Empty dims apparently should produce shaped 0s
sum_matrix_to_scalar_zero_by_zero # Empty dims apparently should produce shaped 0s
sum_3d_eliminate_zero_dim # Empty dims apparently should produce shaped 0s
dot_0_0 # Empty dims apparently should produce shaped 0s
dot_matrix_2x0_0x2 # Empty dims apparently should produce shaped 0s
dot_2x0_0 # Empty dims apparently should produce shaped 0s
numeric_float_nan numeric_float_nan
numeric_double_nan numeric_double_nan
shape_of_scalar
shape_of_vector
shape_of_matrix
shape_of_5d
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment