copy executable from bob/backend_api2

ef2e0118 · Robert Kimball · 122754c1 · ef2e0118 · ef2e0118 · ef2e0118
Commit ef2e0118 authored Jan 26, 2019 by Robert Kimball
20 changed files
--- a/python/ngraph/impl/runtime/__init__.py
+++ b/python/ngraph/impl/runtime/__init__.py
@@ -28,4 +28,5 @@ else:
 sys.setdlopenflags(flags)

 from _pyngraph.runtime import Backend
+from _pyngraph.runtime import Executable
 from _pyngraph.runtime import Tensor
--- a/python/ngraph/runtime.py
+++ b/python/ngraph/runtime.py
@@ -20,7 +20,7 @@ from typing import List, Union
 import numpy as np

 from ngraph.impl import Function, Node, Shape, serialize, util
-from ngraph.impl.runtime import Backend, Tensor
+from ngraph.impl.runtime import Backend, Executable, Tensor
 from ngraph.utils.types import get_dtype, NumericData
 from ngraph.exceptions import UserInputError

@@ -93,7 +93,7 @@ class Computation(object):
                value = np.array(value)
            Computation._write_ndarray_to_tensor_view(value, tensor_view)

-        self.runtime.backend.call(self.handle, self.result_views, self.tensor_views)
+        self.handle.call(self.result_views, self.tensor_views)

        results = []
        for result_view in self.result_views:

--- a/python/pyngraph/runtime/backend.cpp
+++ b/python/pyngraph/runtime/backend.cpp
@@ -35,23 +35,7 @@ void regclass_pyngraph_runtime_Backend(py::module m)
                    const ngraph::element::Type&, const ngraph::Shape&)) &
                    ngraph::runtime::Backend::create_tensor);
    backend.def("compile",
-                (std::shared_ptr<ngraph::Function>(ngraph::runtime::Backend::*)(
+                (std::unique_ptr<ngraph::runtime::Executable>(ngraph::runtime::Backend::*)(
                    std::shared_ptr<ngraph::Function>)) &
                    ngraph::runtime::Backend::compile);
-    backend.def("call",
-                (bool (ngraph::runtime::Backend::*)(
-                    std::shared_ptr<ngraph::Function>,
-                    const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>&,
-                    const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>&)) &
-                    ngraph::runtime::Backend::call);
-    backend.def("remove_compiled_function",
-                (void (ngraph::runtime::Backend::*)(std::shared_ptr<ngraph::Function>)) &
-                    ngraph::runtime::Backend::remove_compiled_function);
-    backend.def("enable_performance_data",
-                (void (ngraph::runtime::Backend::*)(std::shared_ptr<ngraph::Function>, bool)) &
-                    ngraph::runtime::Backend::enable_performance_data);
-    backend.def("get_performance_data",
-                (std::vector<ngraph::runtime::PerformanceCounter>(ngraph::runtime::Backend::*)(
-                    std::shared_ptr<ngraph::Function>)) &
-                    ngraph::runtime::Backend::get_performance_data);
 }
--- a/python/pyngraph/runtime/executable.cpp
+++ b/python/pyngraph/runtime/executable.cpp
+//*****************************************************************************
+// Copyright 2017-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "ngraph/runtime/backend.hpp"
+#include "ngraph/runtime/tensor.hpp"
+#include "pyngraph/runtime/executable.hpp"
+
+namespace py = pybind11;
+
+void regclass_pyngraph_runtime_Executable(py::module m)
+{
+    py::class_<ngraph::runtime::Executable, std::unique_ptr<ngraph::runtime::Executable>>
+        executable(m, "Executable");
+    executable.doc() = "ngraph.impl.runtime.Executable wraps ngraph::runtime::Executable";
+    executable.def("call",
+                   (bool (ngraph::runtime::Executable::*)(
+                       const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>&,
+                       const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>&)) &
+                       ngraph::runtime::Executable::call);
+    executable.def(
+        "get_performance_data",
+        (std::vector<ngraph::runtime::PerformanceCounter>(ngraph::runtime::Executable::*)()) &
+            ngraph::runtime::Executable::get_performance_data);
+}
--- a/python/pyngraph/runtime/executable.hpp
+++ b/python/pyngraph/runtime/executable.hpp
+//*****************************************************************************
+// Copyright 2017-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+void regclass_pyngraph_runtime_Executable(py::module m);
--- a/python/pyngraph/runtime/regmodule_pyngraph_runtime.cpp
+++ b/python/pyngraph/runtime/regmodule_pyngraph_runtime.cpp
@@ -25,4 +25,5 @@ void regmodule_pyngraph_runtime(py::module m)
        m.def_submodule("runtime", "Package ngraph.impl.runtime wraps ngraph::runtime");
    regclass_pyngraph_runtime_Tensor(m_runtime);
    regclass_pyngraph_runtime_Backend(m_runtime);
+    regclass_pyngraph_runtime_Executable(m_runtime);
 }
--- a/python/setup.py
+++ b/python/setup.py
@@ -228,6 +228,7 @@ sources = [
    'pyngraph/ops/softmax.cpp',
    'pyngraph/ops/result.cpp',
    'pyngraph/runtime/backend.cpp',
+    'pyngraph/runtime/executable.cpp',
    'pyngraph/runtime/regmodule_pyngraph_runtime.cpp',
    'pyngraph/runtime/tensor.cpp',
    'pyngraph/passes/manager.cpp',

--- a/python/test/test_ops.py
+++ b/python/test/test_ops.py
--- a/src/ngraph/CMakeLists.txt
+++ b/src/ngraph/CMakeLists.txt
@@ -139,8 +139,8 @@ set (SRC
    pass/memory_visualize.cpp
    pass/nop_elimination.cpp
    pass/pass.cpp
-    pass/pass_config.cpp 
-    pass/prefix_reshape_elimination.cpp 
+    pass/pass_config.cpp
+    pass/prefix_reshape_elimination.cpp
    pass/propagate_cacheability.cpp
    pass/reshape_elimination.cpp
    pass/reshape_sinking.cpp

--- a/src/ngraph/runtime/backend.cpp
+++ b/src/ngraph/runtime/backend.cpp
@@ -39,78 +39,123 @@ vector<string> runtime::Backend::get_registered_devices()
    return BackendManager::get_registered_backends();
 }

-void runtime::Backend::remove_compiled_function(shared_ptr<Function> func)
+bool runtime::Backend::is_supported(const Node& node) const
 {
+    // The default behavior is that a backend does not support any ops. If this is not the case
+    // then override this method and enhance.
+    return false;
 }

-vector<ngraph::runtime::PerformanceCounter>
-    runtime::Backend::get_performance_data(shared_ptr<Function> func) const
+runtime::Executable::Executable()
 {
-    return vector<PerformanceCounter>();
 }

-void runtime::Backend::validate(shared_ptr<const Function> function,
-                                const vector<shared_ptr<runtime::Tensor>>& outputs,
-                                const vector<shared_ptr<runtime::Tensor>>& inputs)
+runtime::Executable::~Executable()
 {
-    const ParameterVector& input_parameters = function->get_parameters();
-    if (input_parameters.size() != inputs.size())
+}
+
+bool runtime::Executable::call_with_validate(const vector<shared_ptr<runtime::Tensor>>& outputs,
+                                             const vector<shared_ptr<runtime::Tensor>>& inputs)
+{
+    validate(outputs, inputs);
+    return call(outputs, inputs);
+}
+
+void runtime::Executable::validate(const vector<std::shared_ptr<runtime::Tensor>>& outputs,
+                                   const vector<std::shared_ptr<runtime::Tensor>>& inputs)
+{
+    const ParameterVector& parameters = get_parameters();
+    const ResultVector& results = get_results();
+    if (parameters.size() != inputs.size())
    {
        stringstream ss;
        ss << "Call input count " << inputs.size() << " does not match Function's Parameter count "
-           << input_parameters.size();
+           << parameters.size();
        throw runtime_error(ss.str());
    }
-    if (function->get_output_size() != outputs.size())
+    if (results.size() != outputs.size())
    {
        stringstream ss;
        ss << "Call output count " << outputs.size() << " does not match Function's Result count "
-           << function->get_output_size();
+           << results.size();
        throw runtime_error(ss.str());
    }

-    for (size_t i = 0; i < input_parameters.size(); i++)
+    for (size_t i = 0; i < parameters.size(); i++)
    {
-        if (input_parameters[i]->get_element_type() != inputs[i]->get_element_type())
+        if (parameters[i]->get_element_type() != inputs[i]->get_element_type())
        {
            stringstream ss;
            ss << "Input " << i << " type '" << inputs[i]->get_element_type()
-               << "' does not match Parameter type '" << input_parameters[i]->get_element_type()
-               << "'";
+               << "' does not match Parameter type '" << parameters[i]->get_element_type() << "'";
            throw runtime_error(ss.str());
        }
-        if (input_parameters[i]->get_shape() != inputs[i]->get_shape())
+        if (parameters[i]->get_shape() != inputs[i]->get_shape())
        {
            stringstream ss;
            ss << "Input " << i << " shape {" << join(inputs[i]->get_shape())
-               << "} does not match Parameter shape {" << join(input_parameters[i]->get_shape())
-               << "}";
+               << "} does not match Parameter shape {" << join(parameters[i]->get_shape()) << "}";
            throw runtime_error(ss.str());
        }
    }

-    for (size_t i = 0; i < function->get_output_size(); i++)
+    for (size_t i = 0; i < results.size(); i++)
    {
-        if (function->get_output_element_type(i) != outputs[i]->get_element_type())
+        if (results[i]->get_element_type() != outputs[i]->get_element_type())
        {
            stringstream ss;
            ss << "Output " << i << " type '" << outputs[i]->get_element_type()
-               << "' does not match Result type '" << function->get_output_element_type(i) << "'";
+               << "' does not match Result type '" << results[i]->get_element_type() << "'";
            throw runtime_error(ss.str());
        }
-        if (function->get_output_shape(i) != outputs[i]->get_shape())
+        if (results[i]->get_shape() != outputs[i]->get_shape())
        {
            stringstream ss;
            ss << "Output " << i << " shape {" << join(outputs[i]->get_shape())
-               << "} does not match Result shape {" << join(function->get_output_shape(i)) << "}";
+               << "} does not match Result shape {" << join(results[i]->get_shape()) << "}";
            throw runtime_error(ss.str());
        }
    }
 }

-bool runtime::Backend::is_supported(const Node& node) const
+const ngraph::ParameterVector& runtime::Executable::get_parameters() const
+{
+    return m_parameters;
+}
+
+const ngraph::ResultVector& runtime::Executable::get_results() const
+{
+    return m_results;
+}
+
+void runtime::Executable::set_parameters_and_results(const Function& func)
+{
+    m_parameters = func.get_parameters();
+    m_results = func.get_results();
+}
+
+vector<runtime::PerformanceCounter> runtime::Executable::get_performance_data() const
+{
+    return vector<PerformanceCounter>();
+}
+
+bool runtime::Backend::is_supported_property(const Property prop) const
 {
-    // The default behavior is that a backend does not support any ops. If this is not the case
-    // then override this method and enhance.
    return false;
 }
+
+bool runtime::Backend::call_with_validate(
+    std::shared_ptr<Executable> exec,
+    const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+    const std::vector<std::shared_ptr<runtime::Tensor>>& inputs)
+{
+    return exec->call_with_validate(outputs, inputs);
+}
+
+bool runtime::Backend::call_with_validate(
+    const std::unique_ptr<Executable>& exec,
+    const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+    const std::vector<std::shared_ptr<runtime::Tensor>>& inputs)
+{
+    return exec->call_with_validate(outputs, inputs);
+}
--- a/src/ngraph/runtime/backend.hpp
+++ b/src/ngraph/runtime/backend.hpp
@@ -30,7 +30,8 @@ namespace ngraph
        class ExternalFunction;
        class Tensor;
        class Backend;
-        using Handle = std::shared_ptr<Function>;
+        class Executable;
+        using Handle = std::shared_ptr<Executable>;
    }
 }

@@ -81,43 +82,8 @@ public:
    /// \brief Compiles a Function.
    /// \param func The function to compile
    /// \returns compiled function or nullptr on failure
-    virtual Handle compile(std::shared_ptr<Function> func) = 0;
-
-    /// \brief Executes a single iteration of a Function. If func is not compiled the call will
-    ///     compile it.
-    /// \param func The function to execute
-    /// \returns true if iteration is successful, false otherwise
-    virtual bool call(std::shared_ptr<Function> func,
-                      const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
-                      const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) = 0;
-
-    /// \brief Executes a single iteration of a Function. If func is not compiled the call will
-    ///     compile it. Optionally validates the inputs and outputs against the function graph.
-    /// \param func The function to execute
-    /// \returns true if iteration is successful, false otherwise
-    bool call_with_validate(std::shared_ptr<Function> func,
-                            const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
-                            const std::vector<std::shared_ptr<runtime::Tensor>>& inputs)
-    {
-        validate(func, outputs, inputs);
-        return call(func, outputs, inputs);
-    }
-
-    /// \brief Compiled functions may be cached. This function removes a compiled function
-    ///     from the cache.
-    /// \param func The function to execute
-    virtual void remove_compiled_function(std::shared_ptr<Function> func);
-
-    /// \brief Enable the collection of per-op performance information on a specified Function.
-    ///     Data collection is via the `get_performance_data` method.
-    /// \param func The function to collect perfomance data on.
-    /// \param enable Set to true to enable or false to disable data collection
-    virtual void enable_performance_data(std::shared_ptr<Function> func, bool enable) {}
-    /// \brief Collect performance information gathered on a Function.
-    /// \param func The function to get collected data.
-    /// \returns Vector of PerformanceCounter information.
-    virtual std::vector<PerformanceCounter>
-        get_performance_data(std::shared_ptr<Function> func) const;
+    virtual std::shared_ptr<Executable> compile(std::shared_ptr<Function> func,
+                                                bool enable_performance_data = false) = 0;

    /// \brief Test if a backend is capable of supporting an op
    /// \param node is the op to test.
@@ -133,8 +99,62 @@ public:
    /// \brief Test if a backend particular property is supported
    /// \param prop is the feature to test.
    /// \returns true if the property is supported, false otherwise.
-    virtual bool is_supported_property(const Property prop) const { return false; }
-    void validate(std::shared_ptr<const Function> func,
-                  const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+    virtual bool is_supported_property(const Property prop) const;
+
+    /// The following methods are temporary hacks to reduce the number of changes in this PR.
+    /// They will be removed in a follow-on PR.
+    bool call_with_validate(std::shared_ptr<Executable> handle,
+                            const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+                            const std::vector<std::shared_ptr<runtime::Tensor>>& inputs);
+    bool call_with_validate(const std::unique_ptr<Executable>& handle,
+                            const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+                            const std::vector<std::shared_ptr<runtime::Tensor>>& inputs);
+};
+
+class ngraph::runtime::Executable
+{
+public:
+    Executable();
+    virtual ~Executable();
+
+    /// \param outputs vector of runtime::Tensor used as outputs
+    /// \param inputs vector of runtime::Tensor used as inputs
+    /// \returns true if iteration is successful, false otherwise
+    virtual bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+                      const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) = 0;
+
+    /// \brief Validates the tensors, then executes a single iteration of a Function.
+    /// \param outputs vector of runtime::Tensor used as outputs
+    /// \param inputs vector of runtime::Tensor used as inputs
+    /// \returns true if iteration is successful, false otherwise
+    bool call_with_validate(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+                            const std::vector<std::shared_ptr<runtime::Tensor>>& inputs);
+
+    /// \brief Collect performance information gathered on a Function.
+    /// \returns Vector of PerformanceCounter information.
+    virtual std::vector<PerformanceCounter> get_performance_data() const;
+
+    /// \brief Validates the supplied tensors against the Function's Parameters and Results.
+    /// \param outputs vector of runtime::Tensor used as outputs
+    /// \param inputs vector of runtime::Tensor used as inputs
+    void validate(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
                  const std::vector<std::shared_ptr<runtime::Tensor>>& inputs);
+
+    /// \brief Query the input Parameters
+    /// \returns an ngraph::ParameterVector of all input parameters
+    const ngraph::ParameterVector& get_parameters() const;
+
+    /// \brief Query the output Results
+    /// \returns an ngraph::ResultVector of all output results
+    const ngraph::ResultVector& get_results() const;
+
+protected:
+    /// \brief Called at the end of compile to set the values to be returned by get_parameters
+    ///     and get_results
+    /// \param func The function with Results fully resolved.
+    void set_parameters_and_results(const Function& func);
+
+private:
+    ngraph::ParameterVector m_parameters;
+    ngraph::ResultVector m_results;
 };
--- a/src/ngraph/runtime/hybrid/hybrid_backend.cpp
+++ b/src/ngraph/runtime/hybrid/hybrid_backend.cpp
@@ -62,14 +62,24 @@ static void node_modifiers(const Node& node, vector<string>& attributes)
    }
 }

-runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> func)
+shared_ptr<runtime::Executable>
+    runtime::hybrid::HybridBackend::compile(shared_ptr<Function> func,
+                                            bool enable_performance_collection)
 {
-    if (m_function_map.find(func) == m_function_map.end())
-    {
-        // Clone function
-        FunctionInstance instance;
-        instance.m_function = clone_function(*func);
+    return make_shared<HybridExecutable>(
+        m_backend_list, func, enable_performance_collection, m_debug_enabled);
+}

+runtime::hybrid::HybridExecutable::HybridExecutable(
+    const std::vector<std::shared_ptr<runtime::Backend>>& backend_list,
+    const shared_ptr<Function>& func,
+    bool enable_performance_collection,
+    bool debug_enabled)
+    : m_function{func}
+    , m_backend_list{backend_list}
+    , m_debug_enabled{debug_enabled}
+{
+    {
        // Run placement pass
        ngraph::pass::Manager pass_manager;
        pass_manager.register_pass<runtime::hybrid::pass::AssignPlacement>(m_backend_list);
@@ -81,16 +91,15 @@ runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> fun
        {
            pass_manager.register_pass<ngraph::pass::VisualizeTree>("graph.png", node_modifiers);
        }
-        pass_manager.run_passes(instance.m_function);
+        pass_manager.run_passes(m_function);

        // Split function to sub_functions
-        tie(instance.m_sub_functions, instance.m_map_parameter_to_result) =
-            runtime::hybrid::split_function_by_placement(instance.m_function);
-        m_function_map.insert({func, instance});
+        tie(m_sub_functions, m_map_parameter_to_result) =
+            runtime::hybrid::split_function_by_placement(m_function);

        // Compile subfunctions in corresponding backends
        size_t subfunction_number = 0;
-        for (shared_ptr<Function>& sub_function : instance.m_sub_functions)
+        for (shared_ptr<Function>& sub_function : m_sub_functions)
        {
            size_t placement = runtime::hybrid::get_colocated_function_placement(sub_function);
            if (m_debug_enabled)
@@ -102,7 +111,8 @@ runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> fun
                pm.run_passes(sub_function);
            }
            auto backend = m_backend_list[placement];
-            backend->compile(sub_function);
+            shared_ptr<Executable> exec = backend->compile(sub_function);
+            m_executable_map[sub_function] = exec;

            // Compile will replace nodes so we need to make one more pass through all
            // ops to reset placement
@@ -113,38 +123,29 @@ runtime::Handle runtime::hybrid::HybridBackend::compile(shared_ptr<Function> fun
        }
    }

-    return func;
+    set_parameters_and_results(*func);
 }

-bool runtime::hybrid::HybridBackend::call(shared_ptr<Function> func,
-                                          const vector<shared_ptr<runtime::Tensor>>& outputs,
-                                          const vector<shared_ptr<runtime::Tensor>>& inputs)
+bool runtime::hybrid::HybridExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
+                                             const vector<shared_ptr<runtime::Tensor>>& inputs)
 {
-    // Get FunctionInstance
    bool rc = true;

    using node_map_t = unordered_map<shared_ptr<Node>, shared_ptr<runtime::Tensor>>;

-    auto fit = m_function_map.find(func);
-    if (fit == m_function_map.end())
-    {
-        throw runtime_error("compile() must be called before call().");
-    }
-    FunctionInstance& instance = fit->second;
-
    // Parameter and result node in sub_function maps to one Tensor
    node_map_t map_node_to_tensor;
    for (size_t i = 0; i < inputs.size(); ++i)
    {
-        map_node_to_tensor[instance.m_function->get_parameters()[i]] = inputs[i];
+        map_node_to_tensor[m_function->get_parameters()[i]] = inputs[i];
    }
    for (size_t i = 0; i < outputs.size(); ++i)
    {
-        map_node_to_tensor[instance.m_function->get_results()[i]] = outputs[i];
+        map_node_to_tensor[m_function->get_results()[i]] = outputs[i];
    }

    // Call subfunctions
-    for (const shared_ptr<Function>& sub_function : instance.m_sub_functions)
+    for (const shared_ptr<Function>& sub_function : m_sub_functions)
    {
        // Init backend
        size_t placement = runtime::hybrid::get_colocated_function_placement(sub_function);
@@ -172,7 +173,7 @@ bool runtime::hybrid::HybridBackend::call(shared_ptr<Function> func,
            else
            {
                // Handle temporary tensors that go between subgraphs
-                auto result_node = instance.m_map_parameter_to_result.at(parameter_node);
+                auto result_node = m_map_parameter_to_result.at(parameter_node);
                auto result = map_node_to_tensor.at(result_node);
                auto parameter = backend->create_tensor(parameter_node->get_element_type(),
                                                        parameter_node->get_shape());
@@ -213,7 +214,8 @@ bool runtime::hybrid::HybridBackend::call(shared_ptr<Function> func,
        }

        // Call
-        backend->call(sub_function, results, parameters);
+        auto exec = m_executable_map[sub_function];
+        exec->call(results, parameters);

        // Need to copy any results to the correct device
        for (const auto& p : copy_back)
@@ -229,7 +231,7 @@ bool runtime::hybrid::HybridBackend::is_supported(const Node& node) const
    return true;
 }

-size_t runtime::hybrid::HybridBackend::get_placement(const runtime::Tensor* t)
+size_t runtime::hybrid::HybridExecutable::get_placement(const runtime::Tensor* t)
 {
    size_t index = 0;
    for (const shared_ptr<ngraph::runtime::Backend>& be : m_backend_list)

--- a/src/ngraph/runtime/hybrid/hybrid_backend.hpp
+++ b/src/ngraph/runtime/hybrid/hybrid_backend.hpp
@@ -30,6 +30,7 @@ namespace ngraph
        namespace hybrid
        {
            class HybridBackend;
+            class HybridExecutable;
        }
    }
 }
@@ -48,29 +49,37 @@ public:
                      const ngraph::Shape& shape,
                      void* memory_pointer) override;

-    Handle compile(std::shared_ptr<ngraph::Function> func) override;
-
-    bool call(std::shared_ptr<ngraph::Function> func,
-              const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& outputs,
-              const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& inputs) override;
+    std::shared_ptr<Executable> compile(std::shared_ptr<ngraph::Function> func,
+                                        bool enable_performance_data = false) override;

    bool is_supported(const ngraph::Node& node) const override;

    void set_debug_enabled(bool flag) { m_debug_enabled = flag; }
 private:
-    class FunctionInstance
-    {
-    public:
-        std::shared_ptr<ngraph::Function> m_function;
-        std::vector<std::shared_ptr<ngraph::Function>> m_sub_functions;
-        std::unordered_map<std::shared_ptr<ngraph::op::Parameter>,
-                           std::shared_ptr<ngraph::op::Result>>
-            m_map_parameter_to_result;
-    };
+    std::vector<std::shared_ptr<runtime::Backend>> m_backend_list;
+    bool m_debug_enabled = false;
+};
+
+class ngraph::runtime::hybrid::HybridExecutable : public runtime::Executable
+{
+public:
+    HybridExecutable(const std::vector<std::shared_ptr<runtime::Backend>>& backend_list,
+                     const std::shared_ptr<Function>& func,
+                     bool enable_performance_collection = false,
+                     bool debug_enabled = false);
+
+    bool call(const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& outputs,
+              const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& inputs) override;
+
+private:
+    std::shared_ptr<ngraph::Function> m_function;
+    std::vector<std::shared_ptr<ngraph::Function>> m_sub_functions;
+    std::unordered_map<std::shared_ptr<ngraph::op::Parameter>, std::shared_ptr<ngraph::op::Result>>
+        m_map_parameter_to_result;

-    std::map<std::shared_ptr<ngraph::Function>, FunctionInstance> m_function_map;
    std::vector<std::shared_ptr<runtime::Backend>> m_backend_list;
    bool m_debug_enabled = false;
+    std::unordered_map<std::shared_ptr<Function>, std::shared_ptr<Executable>> m_executable_map;

    size_t get_placement(const runtime::Tensor* t);
 };
--- a/src/ngraph/runtime/interpreter/int_backend.cpp
+++ b/src/ngraph/runtime/interpreter/int_backend.cpp
@@ -64,12 +64,17 @@ shared_ptr<runtime::Tensor> runtime::interpreter::INTBackend::create_tensor(
    return make_shared<runtime::HostTensor>(type, shape, memory_pointer, this);
 }

-runtime::Handle runtime::interpreter::INTBackend::compile(shared_ptr<Function> function)
+shared_ptr<runtime::Executable>
+    runtime::interpreter::INTBackend::compile(shared_ptr<Function> function,
+                                              bool enable_performance_collection)
+{
+    return make_shared<INTExecutable>(function, enable_performance_collection);
+}
+
+runtime::interpreter::INTExecutable::INTExecutable(const shared_ptr<Function>& function,
+                                                   bool enable_performance_collection)
 {
-    FunctionInstance& instance = m_function_map[function];
-    if (!instance.m_is_compiled)
    {
-        instance.m_is_compiled = true;
        pass::Manager pass_manager;
        pass_manager.register_pass<pass::LikeReplacement>();
        pass_manager.register_pass<pass::AssignLayout<DenseTensorLayout>>();
@@ -78,32 +83,20 @@ runtime::Handle runtime::interpreter::INTBackend::compile(shared_ptr<Function> f
        pass_manager.run_passes(function);

        size_t memory_pool_size = function->get_temporary_pool_size();
-        instance.m_temporary_memory.reset(new AlignedBuffer(memory_pool_size, get_alignment()));
+        m_temporary_memory.reset(new AlignedBuffer(memory_pool_size, get_alignment()));

        for (const shared_ptr<Node>& node : function->get_ordered_ops())
        {
-            instance.m_wrapped_nodes.emplace_back(node);
+            m_wrapped_nodes.emplace_back(node);
        }
    }

-    return function;
+    set_parameters_and_results(*function);
 }

-bool runtime::interpreter::INTBackend::call(shared_ptr<Function> function,
-                                            const vector<shared_ptr<runtime::Tensor>>& outputs,
-                                            const vector<shared_ptr<runtime::Tensor>>& inputs)
+bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
+                                               const vector<shared_ptr<runtime::Tensor>>& inputs)
 {
-    auto fit = m_function_map.find(function);
-    if (fit == m_function_map.end())
-    {
-        throw runtime_error("compile() must be called before call().");
-    }
-    FunctionInstance& instance = fit->second;
-    if (!instance.m_is_compiled)
-    {
-        throw runtime_error("compile() must be called before call().");
-    }
-
    // convert inputs to HostTensor
    vector<void*> func_inputs;
    vector<shared_ptr<runtime::HostTensor>> htv_inputs;
@@ -113,7 +106,7 @@ bool runtime::interpreter::INTBackend::call(shared_ptr<Function> function,
        func_inputs.push_back(static_cast<void*>(host_tensor->get_data_ptr()));
        htv_inputs.push_back(host_tensor);
    }
-    if (instance.m_nan_check_enabled)
+    if (m_nan_check_enabled)
    {
        perform_nan_check(htv_inputs);
    }
@@ -129,7 +122,7 @@ bool runtime::interpreter::INTBackend::call(shared_ptr<Function> function,
    // map function params -> HostTensor
    unordered_map<descriptor::Tensor*, void*> tensor_map;
    size_t input_count = 0;
-    for (auto param : function->get_parameters())
+    for (auto param : get_parameters())
    {
        for (size_t i = 0; i < param->get_output_size(); ++i)
        {
@@ -139,9 +132,9 @@ bool runtime::interpreter::INTBackend::call(shared_ptr<Function> function,
    }

    // map function outputs -> HostTensor
-    for (size_t output_count = 0; output_count < function->get_output_size(); ++output_count)
+    for (size_t output_count = 0; output_count < get_results().size(); ++output_count)
    {
-        auto output = function->get_output_op(output_count);
+        auto output = get_results()[output_count];
        if (!dynamic_pointer_cast<op::Result>(output))
        {
            throw ngraph_error("One of function's outputs isn't op::Result");
@@ -151,7 +144,7 @@ bool runtime::interpreter::INTBackend::call(shared_ptr<Function> function,
    }

    // for each ordered op in the graph
-    for (const NodeWrapper& wrapped : instance.m_wrapped_nodes)
+    for (const NodeWrapper& wrapped : m_wrapped_nodes)
    {
        const Node* op = &wrapped.get_node();
        auto type_id = wrapped.get_typeid();
@@ -185,7 +178,7 @@ bool runtime::interpreter::INTBackend::call(shared_ptr<Function> function,
            if (it == tensor_map.end())
            {
                auto offset = op->get_output_tensor(i).get_pool_offset();
-                host_tensor = instance.get_temporary_pointer(offset);
+                host_tensor = get_temporary_pointer(offset);
                tensor_map.insert({tensor, host_tensor});
            }
            else
@@ -224,16 +217,16 @@ bool runtime::interpreter::INTBackend::call(shared_ptr<Function> function,
        }
 #pragma GCC diagnostic pop

-        if (instance.m_performance_counters_enabled)
+        if (m_performance_counters_enabled)
        {
-            instance.m_timer_map[op].start();
+            m_timer_map[op].start();
        }
-        generate_calls(type, wrapped, op_outputs, op_inputs, instance);
-        if (instance.m_performance_counters_enabled)
+        generate_calls(type, wrapped, op_outputs, op_inputs);
+        if (m_performance_counters_enabled)
        {
-            instance.m_timer_map[op].stop();
+            m_timer_map[op].stop();
        }
-        if (instance.m_nan_check_enabled)
+        if (m_nan_check_enabled)
        {
            perform_nan_check(htv_outputs, op);
        }
@@ -242,26 +235,25 @@ bool runtime::interpreter::INTBackend::call(shared_ptr<Function> function,
    return true;
 }

-void runtime::interpreter::INTBackend::generate_calls(const element::Type& type,
-                                                      const NodeWrapper& op,
-                                                      const vector<void*>& outputs,
-                                                      const vector<const void*>& inputs,
-                                                      FunctionInstance& instance)
+void runtime::interpreter::INTExecutable::generate_calls(const element::Type& type,
+                                                         const NodeWrapper& op,
+                                                         const vector<void*>& outputs,
+                                                         const vector<const void*>& inputs)
 {
    stringstream ss;
    switch (type.get_type_enum())
    {
-    case element::Type_t::boolean: op_engine<char>(op, outputs, inputs, instance); break;
-    case element::Type_t::f32: op_engine<float>(op, outputs, inputs, instance); break;
-    case element::Type_t::f64: op_engine<double>(op, outputs, inputs, instance); break;
-    case element::Type_t::i8: op_engine<int8_t>(op, outputs, inputs, instance); break;
-    case element::Type_t::i16: op_engine<int16_t>(op, outputs, inputs, instance); break;
-    case element::Type_t::i32: op_engine<int32_t>(op, outputs, inputs, instance); break;
-    case element::Type_t::i64: op_engine<int64_t>(op, outputs, inputs, instance); break;
-    case element::Type_t::u8: op_engine<uint8_t>(op, outputs, inputs, instance); break;
-    case element::Type_t::u16: op_engine<uint16_t>(op, outputs, inputs, instance); break;
-    case element::Type_t::u32: op_engine<uint32_t>(op, outputs, inputs, instance); break;
-    case element::Type_t::u64: op_engine<uint64_t>(op, outputs, inputs, instance); break;
+    case element::Type_t::boolean: op_engine<char>(op, outputs, inputs); break;
+    case element::Type_t::f32: op_engine<float>(op, outputs, inputs); break;
+    case element::Type_t::f64: op_engine<double>(op, outputs, inputs); break;
+    case element::Type_t::i8: op_engine<int8_t>(op, outputs, inputs); break;
+    case element::Type_t::i16: op_engine<int16_t>(op, outputs, inputs); break;
+    case element::Type_t::i32: op_engine<int32_t>(op, outputs, inputs); break;
+    case element::Type_t::i64: op_engine<int64_t>(op, outputs, inputs); break;
+    case element::Type_t::u8: op_engine<uint8_t>(op, outputs, inputs); break;
+    case element::Type_t::u16: op_engine<uint16_t>(op, outputs, inputs); break;
+    case element::Type_t::u32: op_engine<uint32_t>(op, outputs, inputs); break;
+    case element::Type_t::u64: op_engine<uint64_t>(op, outputs, inputs); break;
    case element::Type_t::undefined:
    case element::Type_t::dynamic:
    case element::Type_t::bf16:
@@ -270,25 +262,11 @@ void runtime::interpreter::INTBackend::generate_calls(const element::Type& type,
    }
 }

-void runtime::interpreter::INTBackend::set_nan_check(shared_ptr<Function> func, bool enable)
-{
-    FunctionInstance& instance = m_function_map[func];
-    instance.m_nan_check_enabled = enable;
-}
-
-void runtime::interpreter::INTBackend::enable_performance_data(shared_ptr<Function> func,
-                                                               bool enable)
-{
-    FunctionInstance& instance = m_function_map[func];
-    instance.m_performance_counters_enabled = enable;
-}
-
 vector<runtime::PerformanceCounter>
-    runtime::interpreter::INTBackend::get_performance_data(shared_ptr<Function> func) const
+    runtime::interpreter::INTExecutable::get_performance_data() const
 {
    vector<runtime::PerformanceCounter> rc;
-    const FunctionInstance& instance = m_function_map.at(func);
-    for (const pair<const Node*, stopwatch> p : instance.m_timer_map)
+    for (const pair<const Node*, stopwatch> p : m_timer_map)
    {
        rc.emplace_back(p.first->get_name().c_str(),
                        p.second.get_total_microseconds(),
@@ -297,7 +275,7 @@ vector<runtime::PerformanceCounter>
    return rc;
 }

-void runtime::interpreter::INTBackend::perform_nan_check(
+void runtime::interpreter::INTExecutable::perform_nan_check(
    const vector<shared_ptr<HostTensor>>& tensors, const Node* op)
 {
    size_t arg_number = 1;

--- a/src/ngraph/runtime/interpreter/int_backend.hpp
+++ b/src/ngraph/runtime/interpreter/int_backend.hpp
@@ -143,6 +143,7 @@ namespace ngraph
        namespace interpreter
        {
            class INTBackend;
+            class INTExecutable;
        }
    }
 }
@@ -161,52 +162,49 @@ public:

    std::shared_ptr<Tensor> create_tensor(const element::Type& type, const Shape& shape) override;

-    Handle compile(std::shared_ptr<Function> function) override;
+    std::shared_ptr<Executable> compile(std::shared_ptr<Function> function,
+                                        bool enable_performance_data = false) override;

-    bool call(std::shared_ptr<Function> function,
-              const std::vector<std::shared_ptr<Tensor>>& outputs,
-              const std::vector<std::shared_ptr<Tensor>>& intputs) override;
+    bool is_supported(const Node& node) const override;

-    void set_nan_check(std::shared_ptr<Function> func, bool);
+private:
+    std::set<std::string> m_unsupported_op_name_list;
+};

-    void enable_performance_data(std::shared_ptr<Function> func, bool enable) override;
-    std::vector<PerformanceCounter>
-        get_performance_data(std::shared_ptr<Function> func) const override;
+class ngraph::runtime::interpreter::INTExecutable : public Executable
+{
+public:
+    INTExecutable(const std::shared_ptr<Function>& function,
+                  bool enable_performance_collection = false);

-    bool is_supported(const Node& node) const override;
+    bool call(const std::vector<std::shared_ptr<Tensor>>& outputs,
+              const std::vector<std::shared_ptr<Tensor>>& intputs) override;
+
+    void set_nan_check(bool value) { m_nan_check_enabled = value; }
+    std::vector<PerformanceCounter> get_performance_data() const override;

 private:
    int get_alignment() const { return 64; }
-    class FunctionInstance
-    {
-    public:
-        bool m_is_compiled = false;
-        bool m_nan_check_enabled = false;
-        bool m_performance_counters_enabled = false;
-        std::unordered_map<const Node*, stopwatch> m_timer_map;
-        std::vector<NodeWrapper> m_wrapped_nodes;
-        std::unordered_map<const Node*, std::shared_ptr<RNGState>> m_states;
-        std::shared_ptr<AlignedBuffer> m_temporary_memory;
-
-        void* get_temporary_pointer(size_t offset) { return m_temporary_memory->get_ptr(offset); }
-    };
-    std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map;
-    std::set<std::string> m_unsupported_op_name_list;
+    bool m_nan_check_enabled = false;
+    bool m_performance_counters_enabled = false;
+    std::unordered_map<const Node*, stopwatch> m_timer_map;
+    std::vector<NodeWrapper> m_wrapped_nodes;
+    std::unordered_map<const Node*, std::shared_ptr<RNGState>> m_states;
+    std::shared_ptr<AlignedBuffer> m_temporary_memory;

+    void* get_temporary_pointer(size_t offset) { return m_temporary_memory->get_ptr(offset); }
    static void perform_nan_check(const std::vector<std::shared_ptr<HostTensor>>&,
                                  const Node* op = nullptr);

    void generate_calls(const element::Type& type,
                        const NodeWrapper& op,
                        const std::vector<void*>& outputs,
-                        const std::vector<const void*>& inputs,
-                        FunctionInstance& instance);
+                        const std::vector<const void*>& inputs);

    template <typename T>
    void op_engine(const NodeWrapper& node_wrapper,
                   const std::vector<void*>& out,
-                   const std::vector<const void*>& args,
-                   FunctionInstance& instance)
+                   const std::vector<const void*>& args)
    {
        const Node& node = node_wrapper.get_node();
        std::string node_op = node.description();
@@ -364,15 +362,15 @@ private:
        }
        case OP_TYPEID::GenerateMask:
        {
-            if (instance.m_states.count(&node) == 0)
+            if (m_states.count(&node) == 0)
            {
                const op::GenerateMask* gm = static_cast<const op::GenerateMask*>(&node);
-                instance.m_states[&node] = std::unique_ptr<ngraph::RNGState>(
+                m_states[&node] = std::unique_ptr<ngraph::RNGState>(
                    ngraph::RNGState::create_rng_state(gm->get_seed(), gm->get_probability()));
            }

            bool training = static_cast<bool>(static_cast<const T*>(args[0])[0]);
-            auto state = instance.m_states.at(&node).get();
+            auto state = m_states.at(&node).get();
            size_t element_count = shape_size(node.get_output_shape(0));
            reference::generate_mask<T>(
                reinterpret_cast<T*>(out[0]), element_count, state, training);

--- a/src/ngraph/runtime/nop/nop_backend.cpp
+++ b/src/ngraph/runtime/nop/nop_backend.cpp
@@ -54,14 +54,25 @@ shared_ptr<runtime::Tensor> runtime::nop::NOPBackend::create_tensor(const elemen
    return make_shared<runtime::HostTensor>(type, shape, memory_pointer, "external");
 }

-runtime::Handle runtime::nop::NOPBackend::compile(shared_ptr<Function> function)
+shared_ptr<runtime::Executable>
+    runtime::nop::NOPBackend::compile(shared_ptr<Function> function,
+                                      bool enable_performance_collection)
 {
-    return function;
+    return make_shared<NOPExecutable>(function, enable_performance_collection);
 }

-bool runtime::nop::NOPBackend::call(shared_ptr<Function> function,
-                                    const vector<shared_ptr<runtime::Tensor>>& outputs,
-                                    const vector<shared_ptr<runtime::Tensor>>& inputs)
+runtime::nop::NOPExecutable::NOPExecutable(shared_ptr<Function> function,
+                                           bool enable_performance_collection)
+{
+    pass::Manager pass_manager;
+    pass_manager.register_pass<pass::AssignLayout<DenseTensorLayout>>();
+    pass_manager.run_passes(function);
+
+    set_parameters_and_results(*function);
+}
+
+bool runtime::nop::NOPExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
+                                       const vector<shared_ptr<runtime::Tensor>>& inputs)
 {
    return true;
 }
--- a/src/ngraph/runtime/nop/nop_backend.hpp
+++ b/src/ngraph/runtime/nop/nop_backend.hpp
@@ -32,6 +32,7 @@ namespace ngraph
        namespace nop
        {
            class NOPBackend;
+            class NOPExecutable;
        }
    }
 }
@@ -44,9 +45,14 @@ public:

    std::shared_ptr<Tensor> create_tensor(const element::Type& type, const Shape& shape) override;

-    Handle compile(std::shared_ptr<Function> function) override;
+    std::shared_ptr<Executable> compile(std::shared_ptr<Function> function,
+                                        bool enable_performance_data = false) override;
+};

-    bool call(std::shared_ptr<Function> function,
-              const std::vector<std::shared_ptr<Tensor>>& outputs,
-              const std::vector<std::shared_ptr<Tensor>>& intputs) override;
+class ngraph::runtime::nop::NOPExecutable : public Executable
+{
+public:
+    NOPExecutable(std::shared_ptr<Function> function, bool enable_performance_collection = false);
+    bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+              const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
 };
--- a/src/tools/nbench/benchmark.cpp
+++ b/src/tools/nbench/benchmark.cpp
@@ -136,8 +136,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
    stopwatch timer;
    timer.start();
    auto backend = runtime::Backend::create(backend_name);
-    backend->enable_performance_data(f, timing_detail);
-    auto compiled_func = backend->compile(f);
+    auto compiled_func = backend->compile(f, timing_detail);
    timer.stop();
    cout.imbue(locale(""));
    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
@@ -183,7 +182,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
    {
        for (int i = 0; i < warmup_iterations; i++)
        {
-            backend->call(compiled_func, results, args);
+            compiled_func->call(results, args);
        }
    }

@@ -205,7 +204,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
                }
            }
        }
-        backend->call(compiled_func, results, args);
+        compiled_func->call(results, args);
        if (copy_data)
        {
            for (size_t result_index = 0; result_index < results.size(); result_index++)
@@ -222,6 +221,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
    float time = t1.get_milliseconds();
    cout << time / iterations << "ms per iteration" << endl;

-    vector<runtime::PerformanceCounter> perf_data = backend->get_performance_data(f);
+    vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
    return perf_data;
 }
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -36,7 +36,6 @@ set(SRC
    cse.cpp
    element_type.cpp
    file_util.cpp
-    graph_partition.cpp
    includes.cpp
    input_output_assign.cpp
    main.cpp

--- a/test/backend_debug_api.cpp
+++ b/test/backend_debug_api.cpp
@@ -37,9 +37,6 @@ TEST(INTERPRETER, nan_check_input)

    shared_ptr<runtime::Backend> backend = runtime::Backend::create("INTERPRETER");

-    shared_ptr<runtime::interpreter::INTBackend> ibackend =
-        static_pointer_cast<runtime::interpreter::INTBackend>(backend);
-
    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, shape);
    copy_data(a, vector<float>{2, 4, NAN, 16});
@@ -47,9 +44,12 @@ TEST(INTERPRETER, nan_check_input)
    copy_data(b, vector<float>{1, 2, 1, 8});
    auto result = backend->create_tensor(element::f32, shape);

-    auto handle = backend->compile(f);
-    ibackend->set_nan_check(handle, true);
-    EXPECT_ANY_THROW(ibackend->call_with_validate(handle, {result}, {a, b}));
+    shared_ptr<runtime::Executable> handle = backend->compile(f);
+
+    shared_ptr<runtime::interpreter::INTExecutable> ihandle =
+        static_pointer_cast<runtime::interpreter::INTExecutable>(handle);
+    ihandle->set_nan_check(true);
+    EXPECT_ANY_THROW(handle->call_with_validate({result}, {a, b}));
 }

 TEST(INTERPRETER, nan_check_output)
@@ -61,9 +61,6 @@ TEST(INTERPRETER, nan_check_output)

    shared_ptr<runtime::Backend> backend = runtime::Backend::create("INTERPRETER");

-    shared_ptr<runtime::interpreter::INTBackend> ibackend =
-        static_pointer_cast<runtime::interpreter::INTBackend>(backend);
-
    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, shape);
    copy_data(a, vector<float>{2, 4, 0, 16});
@@ -71,7 +68,9 @@ TEST(INTERPRETER, nan_check_output)
    copy_data(b, vector<float>{1, 2, 0, 8});
    auto result = backend->create_tensor(element::f32, shape);

-    auto handle = backend->compile(f);
-    ibackend->set_nan_check(handle, true);
-    EXPECT_ANY_THROW(ibackend->call_with_validate(handle, {result}, {a, b}));
+    shared_ptr<runtime::Executable> handle = backend->compile(f);
+    shared_ptr<runtime::interpreter::INTExecutable> ihandle =
+        static_pointer_cast<runtime::interpreter::INTExecutable>(handle);
+    ihandle->set_nan_check(true);
+    EXPECT_ANY_THROW(handle->call_with_validate({result}, {a, b}));
 }