Commit 78b0a365 authored by Rob Earhart, committed by Scott Cyphers

Minor perf tweaks (#2095)

* Only synchronize stale input tensors

* Cache backend.call() compilations

* Cache invoker bindings

* Update test list
parent 1e5e7145
@@ -52,11 +52,7 @@ bool ngraph::runtime::plaidml::PlaidML_Backend::call(
const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs)
{
- auto cfunc = m_cache.try_lookup(func);
- if (!cfunc)
- {
-     cfunc = m_compiler.compile(func);
- }
+ auto cfunc = m_cache.compile(func, &m_compiler);
cfunc->schedule_invocation(inputs, outputs);
return true;
}
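The hunk above replaces the hand-rolled try_lookup()/compile() sequence with a single m_cache.compile(func, &m_compiler) call, so repeated backend call() invocations of the same Function reuse one compiled artifact instead of recompiling. A minimal sketch of that compile-once cache pattern, with hypothetical names (CacheSketch, the Compiler template parameter) rather than the real nGraph/PlaidML types:

// Compile-once cache sketch: the first lookup of a key compiles it; later
// lookups return the cached result.
#include <map>
#include <memory>
#include <mutex>

template <typename Key, typename Value>
class CacheSketch
{
public:
    template <typename Compiler>
    std::shared_ptr<Value> compile(const std::shared_ptr<Key>& key, Compiler* compiler)
    {
        std::lock_guard<std::mutex> lock{m_mutex};
        auto& entry = m_entries[key];
        if (!entry)
        {
            entry = compiler->compile(key); // only reached on a cache miss
        }
        return entry;
    }

private:
    std::mutex m_mutex;
    std::map<std::shared_ptr<Key>, std::shared_ptr<Value>> m_entries;
};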
......
@@ -42,19 +42,31 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
NGRAPH_DEBUG << "Binding PlaidML function " << this;
+ m_bound_inputs.resize(inputs.size());
+ m_bound_outputs.resize(outputs.size());
std::size_t input_count = 0;
for (const auto& param : m_func->get_parameters())
{
for (std::size_t idx = 0; idx < param->get_output_size(); ++idx)
{
descriptor::Tensor* tv = param->get_output_tensor_ptr(idx).get();
- auto rtv = dynamic_cast<PlaidML_Tensor*>(inputs[input_count++].get());
+ auto& input = inputs.at(input_count);
+ auto rtv = dynamic_cast<PlaidML_Tensor*>(input.get());
if (!rtv)
{
throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"};
"The PlaidML backend only operates on PlaidML tensor views"};
}
rtv->sync_input();
+ auto& bound_input = m_bound_inputs.at(input_count);
+ ++input_count;
+ if (bound_input.lock() == input)
+ {
+     // No need to re-bind this input.
+     continue;
+ }
+ bound_input = input;
NGRAPH_DEBUG << "Binding input " << m_input_names.at(tv) << " to tensor " << rtv;
m_invoker.set_input(m_input_names.at(tv), rtv->tensor());
}
@@ -66,12 +78,21 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
for (std::size_t idx = 0; idx < result->get_output_size(); ++idx)
{
descriptor::Tensor* tv = result->get_output_tensor_ptr(idx).get();
- auto rtv = dynamic_cast<PlaidML_Tensor*>(outputs[output_count++].get());
+ auto& output = outputs.at(output_count);
+ auto rtv = dynamic_cast<PlaidML_Tensor*>(output.get());
if (!rtv)
{
throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"};
"The PlaidML backend only operates on PlaidML tensor views"};
}
+ auto& bound_output = m_bound_outputs.at(output_count);
+ ++output_count;
+ if (bound_output.lock() == output)
+ {
+     // No need to re-bind this output.
+     continue;
+ }
+ bound_output = output;
NGRAPH_DEBUG << "Binding output " << m_output_names.at(tv) << " to tensor " << rtv;
m_invoker.set_output(m_output_names.at(tv), rtv->tensor());
}
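The two loops above cache the last tensor bound to each parameter and result in m_bound_inputs/m_bound_outputs and skip the invoker set_input()/set_output() call when the caller passes the same tensor again. The entries are weak_ptrs, so the cache does not extend tensor lifetimes, and an expired entry compares unequal to any live tensor, forcing a re-bind after the original tensor is destroyed. A stand-alone sketch of that comparison behaviour (TensorLike is a hypothetical stand-in for runtime::Tensor):

#include <cassert>
#include <memory>
#include <vector>

struct TensorLike
{
};

int main()
{
    std::vector<std::weak_ptr<TensorLike>> bound(1);
    auto t = std::make_shared<TensorLike>();

    assert(bound[0].lock() != t); // empty cache entry: must bind
    bound[0] = t;
    assert(bound[0].lock() == t); // same tensor passed again: binding can be skipped

    t.reset(); // tensor destroyed; the weak_ptr entry expires
    auto fresh = std::make_shared<TensorLike>();
    assert(bound[0].lock() != fresh); // expired entry: the new tensor is re-bound
    return 0;
}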
@@ -91,7 +112,7 @@ bool ngraph::runtime::plaidml::CompiledFunction::schedule_invocation(
if (!rtv)
{
throw std::runtime_error{
"The PlaidML backend only operations on PlaidML tensor views"};
"The PlaidML backend only operates on PlaidML tensor views"};
}
rtv->sync_output();
}
......
@@ -58,5 +58,7 @@ private:
std::shared_ptr<Function> m_func;
std::unordered_map<descriptor::Tensor*, std::string> m_input_names;
std::unordered_map<descriptor::Tensor*, std::string> m_output_names;
+ mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_inputs;
+ mutable std::vector<std::weak_ptr<runtime::Tensor>> m_bound_outputs;
mutable vertexai::plaidml::invoker m_invoker;
};
@@ -101,6 +101,11 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::read(void* p, size_t tensor_offse
void ngraph::runtime::plaidml::PlaidML_Tensor::sync_input()
{
+ if (!get_stale())
+ {
+     return;
+ }
+ set_stale(false);
if (!m_memory)
{
if (m_is_logically_zero)
@@ -122,6 +127,7 @@ void ngraph::runtime::plaidml::PlaidML_Tensor::sync_output()
{
// The tensor's been used for an output, so it's no longer logically zero.
m_is_logically_zero = false;
+ set_stale(false);
if (!m_memory)
{
......
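The tensor changes above make sync_input() a no-op unless the tensor is stale, and both sync paths clear the stale flag, so a tensor reused across calls is only re-uploaded after the host actually writes to it again. A minimal sketch of that stale-flag protocol under assumed types (HostBackedTensor and its std::vector buffers are illustrative; the real backend copies into PlaidML buffers):

#include <cstddef>
#include <cstring>
#include <vector>

class HostBackedTensor
{
public:
    explicit HostBackedTensor(std::size_t bytes) : m_host(bytes), m_device(bytes) {}

    // A host-side write marks the tensor stale so the next sync_input() re-copies it.
    void write(const void* src, std::size_t n)
    {
        std::memcpy(m_host.data(), src, n);
        m_stale = true;
    }

    // Copies host -> device only when the host copy changed since the last sync.
    void sync_input()
    {
        if (!m_stale)
        {
            return;
        }
        m_stale = false;
        m_device = m_host;
    }

private:
    bool m_stale = true; // a freshly created tensor must be uploaded once
    std::vector<char> m_host;
    std::vector<char> m_device;
};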
@@ -26,12 +26,15 @@ topk_1d_max_one # No plans to implement TopK
topk_1d_min_all # No plans to implement TopK
topk_1d_min_partial # No plans to implement TopK
topk_1d_min_one # No plans to implement TopK
+ topk_3d_large_input_max # No plans to implement TopK
+ topk_3d_large_input_min # No plans to implement TopK
topk_3d_max_all # No plans to implement TopK
topk_3d_max_partial # No plans to implement TopK
topk_3d_max_one # No plans to implement TopK
topk_3d_min_all # No plans to implement TopK
topk_3d_min_partial # No plans to implement TopK
topk_3d_min_one # No plans to implement TopK
+ topk_3d_single_output # No plans to implement TopK
topk_2d_max_all # No plans to implement TopK
topk_2d_max_partial # No plans to implement TopK
topk_2d_max_one # No plans to implement TopK
@@ -43,15 +46,21 @@ topk_5d_max_partial # No plans to implement TopK
# Tests that PlaidML might be able to run at some point.
backwards_maxpool_n2_c1_hw5_3x3_str2_max_pad1x2_2x3
backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1
backwards_maxpool_n2c1h5w5_kh3kw3_sh2sw2
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_slice
batchnorm_fprop_bprop # To debug
batchnorm_fprop_bprop_2step # To debug
softmax_axis_3d_double # To debug
reduce_matrix_rows_zero # To debug: possible broadcasting error?
reduce_matrix_cols_zero # To debug: possible broadcasting error?
reduce_3d_to_vector # To debug: possible broadcasting error?
replace_slice_matrix_inplace
max_pool_2d_1channel_1image_overpadded
max_pool_3d
maxpool_bprop_larger_than_cache
reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image
@@ -60,31 +69,49 @@ reduce_window_emulating_max_pool_2d_1channel_1image_strided
select_and_scatter_with_overlap
select_and_scatter_without_overlap
select_and_scatter_3d_without_overlap
generate_mask
avg_pool_3d
avg_pool_3d_uneven_strided_padded_include_in_computation
dequantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_UPWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_DOWNWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_EVEN # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_UP # Quantization/Dequantization is unimplemented
quantize_ROUND_DOWN # Quantization/Dequantization is unimplemented
quantize # Quantization/Dequantization is unimplemented
quantize_axes # Quantization/Dequantization is unimplemented
quantize_int8 # Quantization/Dequantization is unimplemented
quantize_clamp # Quantization/Dequantization is unimplemented
dequantize # Quantization/Dequantization is unimplemented
dequantize_axes # Quantization/Dequantization is unimplemented
dequantize_int8 # Quantization/Dequantization is unimplemented
sum_matrix_rows_zero # Empty dims apparently should produce shaped 0s
sum_matrix_cols_zero # Empty dims apparently should produce shaped 0s
sum_vector_zero # Empty dims apparently should produce shaped 0s
sum_matrix_to_scalar_zero_by_zero # Empty dims apparently should produce shaped 0s
sum_3d_eliminate_zero_dim # Empty dims apparently should produce shaped 0s
dot_0_0 # Empty dims apparently should produce shaped 0s
dot_matrix_2x0_0x2 # Empty dims apparently should produce shaped 0s
dot_2x0_0 # Empty dims apparently should produce shaped 0s
dequantize_int8_zero_offset # Quantization/Dequantization is unimplemented
dequantize_int32 # Quantization/Dequantization is unimplemented
dequantize_int32_zero_offset # Quantization/Dequantization is unimplemented
dequantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_UPWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_DOWNWARD # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_EVEN # Quantization/Dequantization is unimplemented
quantize_ROUND_NEAREST_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_INFINITY # Quantization/Dequantization is unimplemented
quantize_ROUND_TOWARD_ZERO # Quantization/Dequantization is unimplemented
quantize_ROUND_UP # Quantization/Dequantization is unimplemented
quantize_ROUND_DOWN # Quantization/Dequantization is unimplemented
quantize # Quantization/Dequantization is unimplemented
quantize_zero_offset # Quantization/Dequantization is unimplemented
quantize_axes # Quantization/Dequantization is unimplemented
quantize_int8 # Quantization/Dequantization is unimplemented
quantize_int8_zero_offset # Quantization/Dequantization is unimplemented
quantize_int32 # Quantization/Dequantization is unimplemented
quantize_int32_zero_offset # Quantization/Dequantization is unimplemented
quantize_clamp # Quantization/Dequantization is unimplemented
quantize_clamp_int8 # Quantization/Dequantization is unimplemented
quantize_clamp_int32 # Quantization/Dequantization is unimplemented
quantize_clamp_int32_zero_offset # Quantization/Dequantization is unimplemented
quantize_clamp_uint8 # Quantization/Dequantization is unimplemented
dequantize # Quantization/Dequantization is unimplemented
dequantize_axes # Quantization/Dequantization is unimplemented
dequantize_int8 # Quantization/Dequantization is unimplemented
sum_matrix_rows_zero # Empty dims apparently should produce shaped 0s
sum_matrix_cols_zero # Empty dims apparently should produce shaped 0s
sum_vector_zero # Empty dims apparently should produce shaped 0s
sum_matrix_to_scalar_zero_by_zero # Empty dims apparently should produce shaped 0s
sum_3d_eliminate_zero_dim # Empty dims apparently should produce shaped 0s
dot_0_0 # Empty dims apparently should produce shaped 0s
dot_matrix_2x0_0x2 # Empty dims apparently should produce shaped 0s
dot_2x0_0 # Empty dims apparently should produce shaped 0s
numeric_float_nan
numeric_double_nan
shape_of_scalar
shape_of_vector
shape_of_matrix
shape_of_5d