Commit 05062078 authored by Robert Kimball

Merge branch 'bob/backend_api3' of https://github.com/NervanaSystems/ngraph into bob/backend_api3

parents 0a11fa24 e873255d
...@@ -144,6 +144,10 @@ bool runtime::Backend::is_supported_property(const Property prop) const ...@@ -144,6 +144,10 @@ bool runtime::Backend::is_supported_property(const Property prop) const
return false; return false;
} }
// Base-class default: intentionally a no-op. Backends that keep no cache of
// compiled executables have nothing to release. Backends that do cache them
// (e.g. the CPU and IntelGPU backends elsewhere in this change) override this
// to drop their cache entry for `exec`.
void runtime::Backend::remove_compiled_function(std::shared_ptr<Executable> exec)
{
}
bool runtime::Backend::call_with_validate( bool runtime::Backend::call_with_validate(
std::shared_ptr<Executable> exec, std::shared_ptr<Executable> exec,
const std::vector<std::shared_ptr<runtime::Tensor>>& outputs, const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
......
...@@ -101,6 +101,8 @@ public: ...@@ -101,6 +101,8 @@ public:
/// \returns true if the property is supported, false otherwise. /// \returns true if the property is supported, false otherwise.
virtual bool is_supported_property(const Property prop) const; virtual bool is_supported_property(const Property prop) const;
virtual void remove_compiled_function(std::shared_ptr<Executable> exec);
/// The following methods are temporary hacks to reduce the number of changes in this PR /// The following methods are temporary hacks to reduce the number of changes in this PR
/// They will be removed in a follow-on PR /// They will be removed in a follow-on PR
bool call_with_validate(std::shared_ptr<Executable> handle, bool call_with_validate(std::shared_ptr<Executable> handle,
......
...@@ -121,6 +121,18 @@ bool runtime::cpu::CPU_Executable::call(const vector<shared_ptr<runtime::Tensor> ...@@ -121,6 +121,18 @@ bool runtime::cpu::CPU_Executable::call(const vector<shared_ptr<runtime::Tensor>
return rc; return rc;
} }
// Remove the cache entry (if any) whose compiled executable is `exec`.
// Only the first matching entry is erased; if `exec` was never produced by
// this backend's compile cache the call is a no-op.
void runtime::cpu::CPU_Backend::remove_compiled_function(shared_ptr<Executable> exec)
{
    auto entry = m_exec_map.begin();
    while (entry != m_exec_map.end())
    {
        if (entry->second == exec)
        {
            // Erasing invalidates only `entry`, and we return immediately.
            m_exec_map.erase(entry);
            return;
        }
        ++entry;
    }
}
vector<runtime::PerformanceCounter> runtime::cpu::CPU_Executable::get_performance_data() const vector<runtime::PerformanceCounter> runtime::cpu::CPU_Executable::get_performance_data() const
{ {
vector<runtime::PerformanceCounter> rc; vector<runtime::PerformanceCounter> rc;
......
...@@ -49,6 +49,8 @@ namespace ngraph ...@@ -49,6 +49,8 @@ namespace ngraph
compile(std::shared_ptr<Function> func, compile(std::shared_ptr<Function> func,
bool enable_performance_counters = false) override; bool enable_performance_counters = false) override;
void remove_compiled_function(shared_ptr<Executable> exec) override;
bool is_supported(const Node& node) const override; bool is_supported(const Node& node) const override;
bool is_supported_property(const Property prop) const override; bool is_supported_property(const Property prop) const override;
......
...@@ -414,13 +414,31 @@ shared_ptr<runtime::Tensor> runtime::intelgpu::IntelGPUBackend::create_tensor( ...@@ -414,13 +414,31 @@ shared_ptr<runtime::Tensor> runtime::intelgpu::IntelGPUBackend::create_tensor(
element_type, shape, *ocl_engine, memory_pointer, this); element_type, shape, *ocl_engine, memory_pointer, this);
} }
runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func) shared_ptr<runtime::Executable>
runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func, bool enable_timing)
{ {
FunctionInstance& instance = ocl_networks[func]; shared_ptr<runtime::Executable> rc;
if (instance.ocl_network != nullptr) auto it = ocl_networks.find(func);
if (it != ocl_networks.end())
{ {
return func; rc = it->second;
}
else
{
rc = make_shared<IntelGPUExecutable>(func, enable_timing);
if (!m_function_cache_disabled)
{
ocl_networks.insert({func, rc});
}
} }
return rc;
}
runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
bool enable_timing)
{
FunctionInstance& instance = m_function_instance;
instance.m_function = func;
set<cldnn::primitive_id> func_output_names; set<cldnn::primitive_id> func_output_names;
cldnn::topology topology; cldnn::topology topology;
...@@ -1803,18 +1821,16 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> ...@@ -1803,18 +1821,16 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function>
instance.m_compilation_time = timer_compile.get_milliseconds(); instance.m_compilation_time = timer_compile.get_milliseconds();
instance.m_consumed_memory = get_max_memory_rss() - mem_before_compile; instance.m_consumed_memory = get_max_memory_rss() - mem_before_compile;
} }
return func;
} }
bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func, bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs) const vector<shared_ptr<runtime::Tensor>>& inputs)
{ {
double mem_call_consumed = 0.0f; double mem_call_consumed = 0.0f;
stopwatch timer_call; stopwatch timer_call;
FunctionInstance& instance = ocl_networks[func]; FunctionInstance& instance = m_function_instance;
shared_ptr<Function> func = instance.m_function;
if (instance.ocl_network == nullptr) if (instance.ocl_network == nullptr)
{ {
throw runtime_error("compile() must be called before call()."); throw runtime_error("compile() must be called before call().");
...@@ -1835,7 +1851,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func, ...@@ -1835,7 +1851,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
{ {
shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv = shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv =
static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]); static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]);
const ParameterVector& input_params = func->get_parameters(); const ParameterVector& input_params = get_parameters();
const string& tensor_name = input_params[i]->get_output_tensor().get_name(); const string& tensor_name = input_params[i]->get_output_tensor().get_name();
network->set_input_data(tensor_name, *tv->get_data_ptr()); network->set_input_data(tensor_name, *tv->get_data_ptr());
} }
...@@ -1884,29 +1900,19 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func, ...@@ -1884,29 +1900,19 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
instance.m_compilation_time = 0.0; instance.m_compilation_time = 0.0;
} }
if (m_function_cache_disabled)
{
remove_compiled_function(func);
}
return true; return true;
} }
void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Function> func) void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Executable> exec)
{ {
ocl_networks.erase(func); for (auto it = ocl_networks.begin(); it != ocl_networks.end(); ++it)
} {
if (it->second == exec)
void runtime::intelgpu::IntelGPUBackend::enable_performance_data(shared_ptr<Function> func,
bool enable)
{
FunctionInstance& instance = ocl_networks[func];
if (instance.ocl_network != nullptr)
{ {
throw runtime_error("Performance data collection must be enabled prior to compiling."); ocl_networks.erase(it);
break;
}
} }
instance.m_performance_counters_enabled = enable;
} }
// The cldnn::network contains something like "generic_layer_0_Parameter_254_0" names // The cldnn::network contains something like "generic_layer_0_Parameter_254_0" names
...@@ -1933,15 +1939,13 @@ static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn ...@@ -1933,15 +1939,13 @@ static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn
} }
vector<runtime::PerformanceCounter> vector<runtime::PerformanceCounter>
runtime::intelgpu::IntelGPUBackend::get_performance_data(shared_ptr<Function> func) const runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
{ {
vector<runtime::PerformanceCounter> rc; vector<runtime::PerformanceCounter> rc;
auto it = ocl_networks.find(func); const shared_ptr<cldnn::network> network = m_function_instance.ocl_network;
if (it != ocl_networks.end()) shared_ptr<Function> func = m_function_instance.m_function;
{
const shared_ptr<cldnn::network> network = it->second.ocl_network;
if (network != nullptr && it->second.m_performance_counters_enabled) if (network != nullptr && m_function_instance.m_performance_counters_enabled)
{ {
const map<cldnn::primitive_id, cldnn::event>& primitives = const map<cldnn::primitive_id, cldnn::event>& primitives =
network->get_executed_primitives(); network->get_executed_primitives();
...@@ -1964,7 +1968,6 @@ vector<runtime::PerformanceCounter> ...@@ -1964,7 +1968,6 @@ vector<runtime::PerformanceCounter>
rc.push_back(perf_counter); rc.push_back(perf_counter);
} }
} }
}
return rc; return rc;
} }
...@@ -1981,7 +1984,7 @@ static Node* get_node_by_name(const shared_ptr<Function> func, const string& nam ...@@ -1981,7 +1984,7 @@ static Node* get_node_by_name(const shared_ptr<Function> func, const string& nam
return nullptr; return nullptr;
} }
void runtime::intelgpu::IntelGPUBackend::print_call_performance( void runtime::intelgpu::IntelGPUExecutable::print_call_performance(
const shared_ptr<cldnn::network> network, const shared_ptr<cldnn::network> network,
const shared_ptr<Function> func, const shared_ptr<Function> func,
double time_compile, double time_compile,
......
...@@ -31,6 +31,7 @@ namespace ngraph ...@@ -31,6 +31,7 @@ namespace ngraph
namespace intelgpu namespace intelgpu
{ {
class IntelGPUBackend; class IntelGPUBackend;
class IntelGPUExecutable;
} }
} }
} }
...@@ -47,18 +48,35 @@ public: ...@@ -47,18 +48,35 @@ public:
std::shared_ptr<ngraph::runtime::Tensor> std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type, const Shape& shape) override; create_tensor(const ngraph::element::Type& element_type, const Shape& shape) override;
Handle compile(std::shared_ptr<Function> func) override; std::shared_ptr<runtime::Executable> compile(std::shared_ptr<Function> func,
bool enable_timing = false) override;
void remove_compiled_function(std::shared_ptr<runtime::Executable> exec) override;
bool call(std::shared_ptr<Function> func, bool is_supported_property(const Property prop) const override;
const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
void remove_compiled_function(std::shared_ptr<Function> func) override; private:
void enable_performance_data(std::shared_ptr<Function> func, bool enable) override; std::shared_ptr<cldnn::engine> ocl_engine;
std::vector<PerformanceCounter> std::map<std::shared_ptr<Function>, std::shared_ptr<runtime::Executable>> ocl_networks;
get_performance_data(std::shared_ptr<Function> func) const override;
bool is_supported_property(const Property prop) const override; bool m_profile_enable = false;
long m_profile_lines_limit_count = 10;
bool m_dump_graph_enable = false;
bool m_cldnn_graph_optimize = true;
bool m_cldnn_dump_enable = false;
bool m_function_cache_disabled = false;
bool m_disable_backend_optimizations = false;
std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
std::string delim = std::string(":");
};
class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
{
public:
IntelGPUExecutable(std::shared_ptr<Function> func, bool enable_timing);
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
std::vector<PerformanceCounter> get_performance_data() const override;
private: private:
class FunctionInstance class FunctionInstance
...@@ -68,12 +86,19 @@ private: ...@@ -68,12 +86,19 @@ private:
bool m_performance_counters_enabled = false; bool m_performance_counters_enabled = false;
double m_compilation_time = 0.0; double m_compilation_time = 0.0;
double m_consumed_memory = 0.0; double m_consumed_memory = 0.0;
}; std::shared_ptr<Function> m_function;
} m_function_instance;
std::map<std::shared_ptr<Function>, FunctionInstance> ocl_networks;
std::shared_ptr<cldnn::engine> ocl_engine;
bool m_profile_enable = false;
long m_profile_lines_limit_count = 10;
bool m_dump_graph_enable = false;
bool m_cldnn_graph_optimize = true;
bool m_cldnn_dump_enable = false;
bool m_function_cache_disabled = false;
bool m_disable_backend_optimizations = false; bool m_disable_backend_optimizations = false;
std::shared_ptr<cldnn::engine> ocl_engine;
std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
std::string delim = std::string(":");
// Statistic related things // Statistic related things
void print_call_performance(const std::shared_ptr<cldnn::network> network, void print_call_performance(const std::shared_ptr<cldnn::network> network,
...@@ -83,13 +108,4 @@ private: ...@@ -83,13 +108,4 @@ private:
double mem_compilation_consumed, double mem_compilation_consumed,
double mem_call_consumed, double mem_call_consumed,
double mem_current) const; double mem_current) const;
bool m_profile_enable = false;
long m_profile_lines_limit_count = 10;
bool m_dump_graph_enable = false;
bool m_cldnn_graph_optimize = true;
bool m_cldnn_dump_enable = false;
bool m_function_cache_disabled = false;
std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
std::string delim = std::string(":");
}; };
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment