Unverified commit e6b26ac6 authored by Robert Kimball, committed by GitHub

GPU transformer cleanup (#4361)

* Move Executable to its own files

* Normalize class names

* More cleanup
Co-authored-by: Chris Sullivan <chris.sullivan@intel.com>
parent 134b285f
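
For orientation before the diff: a minimal caller-side sketch of the runtime API this cleanup touches, assuming the standard ngraph entry points (`runtime::Backend::create`, `compile`, `create_tensor`, `Executable::call`). The Function construction is elided; shapes and element types below are hypothetical.

```cpp
#include <memory>

#include "ngraph/ngraph.hpp"

using namespace ngraph;

// Hedged sketch: drive the renamed GPUBackend/GPUExecutable through the
// public runtime API. "func" is any ngraph::Function the caller has built.
void run_on_gpu(const std::shared_ptr<Function>& func)
{
    // "GPU" is the name registered by ngraph_register_gpu_backend() below
    auto backend = runtime::Backend::create("GPU");

    // compile() returns a runtime::Executable (internally a GPUExecutable)
    auto exec = backend->compile(func);

    // Tensors must come from the GPU backend; create_tensor() allocates
    // device memory via GPUTensor
    auto input = backend->create_tensor(element::f32, Shape{2, 2});
    auto output = backend->create_tensor(element::f32, Shape{2, 2});

    exec->call({output}, {input});
}
```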
......@@ -30,6 +30,7 @@ set(SRC
gpu_cuda_function_pool.cpp
gpu_cuda_kernel_builder.cpp
gpu_emitter.cpp
gpu_executable.cpp
gpu_compiled_function.cpp
gpu_internal_function.cpp
gpu_invoke.cpp
......
......@@ -23,6 +23,7 @@
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_executable.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_internal_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
......@@ -36,16 +37,16 @@ using namespace std;
extern "C" GPU_BACKEND_API void ngraph_register_gpu_backend()
{
runtime::BackendManager::register_backend("GPU", [](const std::string& /* config */) {
return make_shared<runtime::gpu::GPU_Backend>();
return make_shared<runtime::gpu::GPUBackend>();
});
}
runtime::gpu::GPU_Backend::GPU_Backend()
runtime::gpu::GPUBackend::GPUBackend()
: runtime::Backend()
{
}
runtime::gpu::GPU_Backend::BackendContext::BackendContext()
runtime::gpu::GPUBackend::BackendContext::BackendContext()
: m_runtime_context(new GPURuntimeContext)
, m_primitive_emitter(new GPUPrimitiveEmitter(m_runtime_context))
, m_cuda_manager(new CudaContextManager)
......@@ -75,7 +76,7 @@ runtime::gpu::GPU_Backend::BackendContext::BackendContext()
m_runtime_context->compiled_kernel_pool = new CudaFunctionPool;
}
void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
void runtime::gpu::GPUBackend::BackendContext::prepare_runtime_context()
{
// set context current each time in case thread changed
bind_cuda_context_to_thread();
......@@ -84,12 +85,12 @@ void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
m_runtime_context->gpu_memory_primitives = m_primitive_emitter->get_memory_primitives().data();
}
void runtime::gpu::GPU_Backend::BackendContext::bind_cuda_context_to_thread()
void runtime::gpu::GPUBackend::BackendContext::bind_cuda_context_to_thread()
{
m_cuda_manager->SetContextCurrent();
}
runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
runtime::gpu::GPUBackend::BackendContext::~BackendContext()
{
cublasDestroy(*m_runtime_context->cublas_handle);
delete m_runtime_context->cublas_handle;
......@@ -99,12 +100,12 @@ runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
}
shared_ptr<runtime::Tensor>
runtime::gpu::GPU_Backend::create_tensor(const element::Type& element_type, const Shape& shape)
runtime::gpu::GPUBackend::create_tensor(const element::Type& element_type, const Shape& shape)
{
return make_shared<runtime::gpu::GPUTensor>(element_type, shape);
}
shared_ptr<runtime::Tensor> runtime::gpu::GPU_Backend::create_tensor(
shared_ptr<runtime::Tensor> runtime::gpu::GPUBackend::create_tensor(
const element::Type& element_type, const Shape& shape, void* memory_pointer)
{
if (memory_pointer != nullptr && !is_device_pointer(memory_pointer))
......@@ -114,8 +115,8 @@ shared_ptr<runtime::Tensor> runtime::gpu::GPU_Backend::create_tensor(
return make_shared<runtime::gpu::GPUTensor>(element_type, shape, memory_pointer);
}
shared_ptr<runtime::Executable> runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func,
bool timing_enable)
shared_ptr<runtime::Executable> runtime::gpu::GPUBackend::compile(shared_ptr<Function> func,
bool timing_enable)
{
shared_ptr<runtime::Executable> rc;
auto it = m_exec_map.find(func);
......@@ -125,87 +126,13 @@ shared_ptr<runtime::Executable> runtime::gpu::GPU_Backend::compile(shared_ptr<Fu
}
else
{
rc = make_shared<GPU_Executable>(func, timing_enable);
rc = make_shared<GPUExecutable>(func, timing_enable);
m_exec_map.insert({func, rc});
}
return rc;
}
runtime::gpu::GPU_Executable::GPU_Executable(shared_ptr<Function> func, bool enable_timing)
: m_context(new GPU_Backend::BackendContext())
{
FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function == nullptr)
{
m_context->bind_cuda_context_to_thread();
instance.m_compiled_function = runtime::gpu::GPUCompiledFunction::make(func, m_context);
instance.m_compiled_function->m_emit_timing = enable_timing;
instance.m_compiled_function->compile();
instance.m_runtime = instance.m_compiled_function->m_runtime;
instance.m_inputs.resize(func->get_parameters().size());
instance.m_outputs.resize(func->get_output_size());
}
set_parameters_and_results(*func);
}
void runtime::gpu::GPU_Executable::initialize_io(void** target,
const vector<shared_ptr<runtime::Tensor>>& source)
{
for (size_t i = 0; i < source.size(); i++)
{
shared_ptr<runtime::gpu::GPUTensor> tv =
dynamic_pointer_cast<runtime::gpu::GPUTensor>(source[i]);
if (tv)
{
target[i] = tv->m_allocated_buffer_pool;
}
else
{
throw invalid_argument("Tensors passed to GPU backend must be GPU Tensors");
}
}
}
bool runtime::gpu::GPU_Executable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function == nullptr)
{
throw runtime_error("compile() must be called before call().");
}
// ensure the GPURuntimeContext primitive pointers are valid
m_context->prepare_runtime_context();
// Device tensors
initialize_io(instance.m_inputs.data(), inputs);
initialize_io(instance.m_outputs.data(), outputs);
auto ctx = m_context->m_runtime_context.get();
instance.m_runtime(instance.m_inputs.data(), instance.m_outputs.data(), ctx);
return true;
}
// void runtime::gpu::GPU_Backend::remove_compiled_function(shared_ptr<Function> func)
// {
// m_function_map.erase(func);
// }
vector<runtime::PerformanceCounter> runtime::gpu::GPU_Executable::get_performance_data() const
{
std::vector<runtime::PerformanceCounter> rc;
const FunctionInstance& instance = m_function_instance;
if (instance.m_compiled_function != nullptr)
{
instance.m_compiled_function->get_performance_data(rc);
}
return rc;
}
bool runtime::gpu::GPU_Backend::is_supported(const Node& op) const
bool runtime::gpu::GPUBackend::is_supported(const Node& op) const
{
set<string> unsupported_ops = {"Quantize",
"Dequantize",
......
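
One behavioral detail worth calling out from the compile() hunk above: GPUBackend::compile() memoizes per Function through m_exec_map, so compiling the same graph twice on the same backend instance returns the cached Executable. A hedged sketch of the observable effect:

```cpp
#include <cassert>
#include <memory>

#include "ngraph/ngraph.hpp"

// Hedged sketch of the m_exec_map caching shown above: the second compile()
// is a cache hit and returns the same shared_ptr, so codegen runs only once.
void compile_twice(const std::shared_ptr<ngraph::Function>& func)
{
    auto backend = ngraph::runtime::Backend::create("GPU");
    auto exec1 = backend->compile(func);
    auto exec2 = backend->compile(func);
    assert(exec1 == exec2); // same Executable out of m_exec_map
}
```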
......@@ -35,15 +35,16 @@ namespace ngraph
class GPUPrimitiveEmitter;
struct GPURuntimeContext;
class CudaContextManager;
class GPUExecutable;
using EntryPoint_t = void(void** inputs, void** outputs, GPURuntimeContext* ctx);
using EntryPoint = std::function<EntryPoint_t>;
BackendConstructor GPU_BACKEND_API get_backend_constructor_pointer();
class GPU_Backend : public Backend
class GPUBackend : public Backend
{
public:
GPU_Backend();
GPUBackend();
std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type,
......@@ -77,40 +78,6 @@ namespace ngraph
private:
std::map<std::shared_ptr<Function>, std::shared_ptr<Executable>> m_exec_map;
};
class GPU_Executable : public Executable
{
public:
GPU_Executable(std::shared_ptr<Function> func, bool enable_timing);
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
// void remove_compiled_function(std::shared_ptr<Function> func) override;
std::vector<PerformanceCounter> get_performance_data() const override;
private:
class FunctionInstance
{
public:
std::shared_ptr<GPUCompiledFunction> m_compiled_function;
bool m_performance_counters_enabled = false;
EntryPoint m_runtime;
std::vector<void*> m_inputs;
std::vector<void*> m_outputs;
} m_function_instance;
/// \brief Convert a vector of Tensor into a vector of void* where each void*
/// points to a Tensor's data buffer.
/// \param target Pointer to a pre-allocated array of void* with
/// size >= source.size()
/// \param source Source vector of Tensors
static void
initialize_io(void** target,
const std::vector<std::shared_ptr<runtime::Tensor>>& source);
std::shared_ptr<GPU_Backend::BackendContext> m_context;
};
}
}
}
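
The EntryPoint typedef above fixes the calling convention between GPUExecutable and compiled code: two flat arrays of raw device pointers plus a GPURuntimeContext. A hedged illustration follows; the entry-point body here is hypothetical, since real ones are produced by GPUCompiledFunction rather than written by hand.

```cpp
#include "ngraph/runtime/gpu/gpu_backend.hpp"

// Hypothetical function matching EntryPoint_t. Real entry points come out of
// GPUCompiledFunction::compile(), not hand-written code.
static void example_entry(void** inputs, void** outputs,
                          ngraph::runtime::gpu::GPURuntimeContext* ctx)
{
    // inputs[i] / outputs[i] are device pointers filled in by
    // GPUExecutable::initialize_io(); ctx carries the cublas/cudnn handles
    // and primitive tables set up by prepare_runtime_context().
    (void)inputs;
    (void)outputs;
    (void)ctx;
}

// What GPUExecutable::m_runtime holds:
ngraph::runtime::gpu::EntryPoint runtime_fn = example_entry;
```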
......@@ -81,7 +81,7 @@ static GPUStaticInitializers s_static_initializers;
runtime::gpu::GPUCompiledFunction::GPUCompiledFunction(
const shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
const std::shared_ptr<GPUBackend::BackendContext>& shared_context)
: m_runtime(nullptr)
, m_function(function)
, m_emit_timing(false)
......@@ -119,7 +119,7 @@ std::vector<std::string> get_case_variants(std::vector<std::string> cases)
std::shared_ptr<runtime::gpu::GPUCompiledFunction> runtime::gpu::GPUCompiledFunction::make(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
const std::shared_ptr<GPUBackend::BackendContext>& shared_context)
{
return std::make_shared<runtime::gpu::GPUInternalFunction>(function, shared_context);
}
......
......@@ -48,18 +48,18 @@ namespace ngraph
class GPUCompiledFunction
{
friend class GPU_Backend;
friend class GPU_Executable;
friend class GPUBackend;
friend class GPUExecutable;
public:
GPUCompiledFunction(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
const std::shared_ptr<GPUBackend::BackendContext>& shared_context);
virtual ~GPUCompiledFunction();
static std::shared_ptr<GPUCompiledFunction>
make(const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
const std::shared_ptr<GPUBackend::BackendContext>& shared_context);
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
{
......@@ -110,7 +110,7 @@ namespace ngraph
std::string m_function_name;
std::unordered_map<std::string, size_t> m_tensor_memory_buffers;
std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
std::shared_ptr<GPUBackend::BackendContext> m_shared_context;
};
}
}
......
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/graph_util.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/gpu/gpu_executable.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_internal_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
using namespace std;
runtime::gpu::GPUExecutable::GPUExecutable(shared_ptr<Function> func, bool enable_timing)
: m_context(new GPUBackend::BackendContext())
{
if (m_compiled_function == nullptr)
{
m_context->bind_cuda_context_to_thread();
m_compiled_function = runtime::gpu::GPUCompiledFunction::make(func, m_context);
m_compiled_function->m_emit_timing = enable_timing;
m_compiled_function->compile();
m_runtime = m_compiled_function->m_runtime;
m_inputs.resize(func->get_parameters().size());
m_outputs.resize(func->get_output_size());
}
set_parameters_and_results(*func);
}
void runtime::gpu::GPUExecutable::initialize_io(void** target,
const vector<shared_ptr<runtime::Tensor>>& source)
{
for (size_t i = 0; i < source.size(); i++)
{
shared_ptr<runtime::gpu::GPUTensor> tv =
dynamic_pointer_cast<runtime::gpu::GPUTensor>(source[i]);
if (tv)
{
target[i] = tv->m_allocated_buffer_pool;
}
else
{
throw invalid_argument("Tensors passed to GPU backend must be GPU Tensors");
}
}
}
bool runtime::gpu::GPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
if (m_compiled_function == nullptr)
{
throw runtime_error("compile() must be called before call().");
}
// ensure the GPURuntimeContext primitive pointers are valid
m_context->prepare_runtime_context();
// Device tensors
initialize_io(m_inputs.data(), inputs);
initialize_io(m_outputs.data(), outputs);
auto ctx = m_context->m_runtime_context.get();
m_runtime(m_inputs.data(), m_outputs.data(), ctx);
return true;
}
vector<runtime::PerformanceCounter> runtime::gpu::GPUExecutable::get_performance_data() const
{
std::vector<runtime::PerformanceCounter> rc;
if (m_compiled_function != nullptr)
{
m_compiled_function->get_performance_data(rc);
}
return rc;
}
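
A hedged usage sketch for the timing path above, assuming the two-argument compile() overload shown in gpu_backend.cpp. PerformanceCounter accessor names vary across ngraph versions, so the ones used below are assumptions.

```cpp
#include <iostream>
#include <memory>
#include <vector>

#include "ngraph/ngraph.hpp"

// Hedged sketch: enable per-op timers at compile time (m_emit_timing above),
// run once, then read the counters back from the executable.
void profile(const std::shared_ptr<ngraph::Function>& func,
             const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& outputs,
             const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& inputs)
{
    auto backend = ngraph::runtime::Backend::create("GPU");
    auto exec = backend->compile(func, /*timing_enable=*/true);
    exec->call(outputs, inputs);
    for (const auto& pc : exec->get_performance_data())
    {
        // get_node() / total_microseconds() assumed; check your ngraph version
        std::cout << pc.get_node()->get_name() << ": "
                  << pc.total_microseconds() << " us\n";
    }
}
```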
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include "gpu_backend_visibility.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
class GPUPrimitiveEmitter;
struct GPURuntimeContext;
class CudaContextManager;
using EntryPoint_t = void(void** inputs, void** outputs, GPURuntimeContext* ctx);
using EntryPoint = std::function<EntryPoint_t>;
class GPUExecutable : public Executable
{
public:
GPUExecutable(std::shared_ptr<Function> func, bool enable_timing);
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
// void remove_compiled_function(std::shared_ptr<Function> func) override;
std::vector<PerformanceCounter> get_performance_data() const override;
private:
std::shared_ptr<GPUCompiledFunction> m_compiled_function;
bool m_performance_counters_enabled = false;
EntryPoint m_runtime;
std::vector<void*> m_inputs;
std::vector<void*> m_outputs;
/// \brief Convert a vector of Tensor into a vector of void* where each void*
/// points to a Tensor's data buffer.
/// \param target Pointer to a pre-allocated array of void* with
/// size >= source.size()
/// \param source Source vector of Tensors
static void
initialize_io(void** target,
const std::vector<std::shared_ptr<runtime::Tensor>>& source);
std::shared_ptr<GPUBackend::BackendContext> m_context;
};
}
}
}
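
Finally, the GPUTensor requirement that initialize_io() documents above is visible to callers: call() rejects tensors that did not come from the GPU backend. A hedged sketch, where the CPU backend is just an example source of foreign tensors and func is assumed to take and produce one Shape{4} tensor:

```cpp
#include <memory>

#include "ngraph/ngraph.hpp"

// Hedged sketch: only tensors created by the GPU backend are accepted;
// anything else makes initialize_io() throw std::invalid_argument.
void tensor_origin_matters(const std::shared_ptr<ngraph::Function>& func)
{
    using namespace ngraph;
    auto gpu = runtime::Backend::create("GPU");
    auto cpu = runtime::Backend::create("CPU"); // assumes CPU backend is built

    auto t_gpu = gpu->create_tensor(element::f32, Shape{4});
    auto t_cpu = cpu->create_tensor(element::f32, Shape{4});

    auto exec = gpu->compile(func);
    exec->call({t_gpu}, {t_gpu}); // ok: both are GPUTensors
    // exec->call({t_cpu}, {t_cpu}); // throws: "Tensors passed to GPU
    //                               //  backend must be GPU Tensors"
}
```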
......@@ -153,7 +153,7 @@ std::string runtime::gpu::GPUExternalFunction::emit_op(GPUCompiledFunction* exte
runtime::gpu::GPUExternalFunction::GPUExternalFunction(
const shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
const std::shared_ptr<GPUBackend::BackendContext>& shared_context)
: GPUCompiledFunction(function, shared_context)
{
}
......
......@@ -52,7 +52,7 @@ namespace ngraph
public:
GPUExternalFunction(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
const std::shared_ptr<GPUBackend::BackendContext>& shared_context);
virtual ~GPUExternalFunction();
virtual std::string
......
......@@ -128,7 +128,7 @@ std::string runtime::gpu::GPUInternalFunction::emit_op(GPUCompiledFunction* comp
runtime::gpu::GPUInternalFunction::GPUInternalFunction(
const shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
const std::shared_ptr<GPUBackend::BackendContext>& shared_context)
: GPUCompiledFunction(function, shared_context)
{
}
......
......@@ -49,7 +49,7 @@ namespace ngraph
public:
GPUInternalFunction(
const std::shared_ptr<ngraph::Function>& function,
const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
const std::shared_ptr<GPUBackend::BackendContext>& shared_context);
virtual ~GPUInternalFunction();
virtual std::string
......