Unverified Commit e6b26ac6 authored by Robert Kimball, committed by GitHub

GPU transformer cleanup (#4361)

* Move Executable to its own files

* Normalize class names

* More cleanup
Co-authored-by: Chris Sullivan <chris.sullivan@intel.com>
parent 134b285f
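The renames in this commit are mechanical: the backend is still registered under the name "GPU", so code that goes through the backend registry is unaffected. A minimal sketch of such caller code (assuming the nGraph v0.x runtime API; this snippet is illustrative and not part of the commit):

#include "ngraph/ngraph.hpp"

using namespace ngraph;

int main()
{
    // Build a trivial graph: f(A, B) = A + B over 2x2 float tensors.
    auto A = std::make_shared<op::Parameter>(element::f32, Shape{2, 2});
    auto B = std::make_shared<op::Parameter>(element::f32, Shape{2, 2});
    auto f = std::make_shared<Function>(std::make_shared<op::Add>(A, B),
                                        ParameterVector{A, B});

    // Lookup by registry name; the concrete class behind it is now
    // GPUBackend (formerly GPU_Backend).
    auto backend = runtime::Backend::create("GPU");

    // compile() now returns a GPUExecutable (formerly GPU_Executable).
    auto exec = backend->compile(f);

    // Tensors created by the GPU backend are GPUTensors, which is what
    // GPUExecutable::initialize_io requires (tensor data writes omitted).
    auto a = backend->create_tensor(element::f32, Shape{2, 2});
    auto b = backend->create_tensor(element::f32, Shape{2, 2});
    auto result = backend->create_tensor(element::f32, Shape{2, 2});
    exec->call({result}, {a, b});
}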
@@ -30,6 +30,7 @@ set(SRC
     gpu_cuda_function_pool.cpp
     gpu_cuda_kernel_builder.cpp
     gpu_emitter.cpp
+    gpu_executable.cpp
     gpu_compiled_function.cpp
     gpu_internal_function.cpp
     gpu_invoke.cpp
...
@@ -23,6 +23,7 @@
 #include "ngraph/op/batch_norm.hpp"
 #include "ngraph/runtime/backend_manager.hpp"
 #include "ngraph/runtime/gpu/gpu_backend.hpp"
+#include "ngraph/runtime/gpu/gpu_executable.hpp"
 #include "ngraph/runtime/gpu/gpu_external_function.hpp"
 #include "ngraph/runtime/gpu/gpu_internal_function.hpp"
 #include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
@@ -36,16 +37,16 @@ using namespace std;
 extern "C" GPU_BACKEND_API void ngraph_register_gpu_backend()
 {
     runtime::BackendManager::register_backend("GPU", [](const std::string& /* config */) {
-        return make_shared<runtime::gpu::GPU_Backend>();
+        return make_shared<runtime::gpu::GPUBackend>();
     });
 }

-runtime::gpu::GPU_Backend::GPU_Backend()
+runtime::gpu::GPUBackend::GPUBackend()
     : runtime::Backend()
 {
 }

-runtime::gpu::GPU_Backend::BackendContext::BackendContext()
+runtime::gpu::GPUBackend::BackendContext::BackendContext()
     : m_runtime_context(new GPURuntimeContext)
     , m_primitive_emitter(new GPUPrimitiveEmitter(m_runtime_context))
     , m_cuda_manager(new CudaContextManager)
@@ -75,7 +76,7 @@ runtime::gpu::GPU_Backend::BackendContext::BackendContext()
     m_runtime_context->compiled_kernel_pool = new CudaFunctionPool;
 }

-void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
+void runtime::gpu::GPUBackend::BackendContext::prepare_runtime_context()
 {
     // set context current each time in case thread changed
     bind_cuda_context_to_thread();
@@ -84,12 +85,12 @@ void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
     m_runtime_context->gpu_memory_primitives = m_primitive_emitter->get_memory_primitives().data();
 }

-void runtime::gpu::GPU_Backend::BackendContext::bind_cuda_context_to_thread()
+void runtime::gpu::GPUBackend::BackendContext::bind_cuda_context_to_thread()
 {
     m_cuda_manager->SetContextCurrent();
 }

-runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
+runtime::gpu::GPUBackend::BackendContext::~BackendContext()
 {
     cublasDestroy(*m_runtime_context->cublas_handle);
     delete m_runtime_context->cublas_handle;
@@ -99,12 +100,12 @@ runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
 }

 shared_ptr<runtime::Tensor>
-    runtime::gpu::GPU_Backend::create_tensor(const element::Type& element_type, const Shape& shape)
+    runtime::gpu::GPUBackend::create_tensor(const element::Type& element_type, const Shape& shape)
 {
     return make_shared<runtime::gpu::GPUTensor>(element_type, shape);
 }

-shared_ptr<runtime::Tensor> runtime::gpu::GPU_Backend::create_tensor(
+shared_ptr<runtime::Tensor> runtime::gpu::GPUBackend::create_tensor(
     const element::Type& element_type, const Shape& shape, void* memory_pointer)
 {
     if (memory_pointer != nullptr && !is_device_pointer(memory_pointer))
@@ -114,8 +115,8 @@ shared_ptr<runtime::Tensor> runtime::gpu::GPU_Backend::create_tensor(
     return make_shared<runtime::gpu::GPUTensor>(element_type, shape, memory_pointer);
 }

-shared_ptr<runtime::Executable> runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func,
-                                                                   bool timing_enable)
+shared_ptr<runtime::Executable> runtime::gpu::GPUBackend::compile(shared_ptr<Function> func,
+                                                                  bool timing_enable)
 {
     shared_ptr<runtime::Executable> rc;
     auto it = m_exec_map.find(func);
@@ -125,87 +126,13 @@ shared_ptr<runtime::Executable> runtime::gpu::GPU_Backend::compile(shared_ptr<Fu
     }
     else
     {
-        rc = make_shared<GPU_Executable>(func, timing_enable);
+        rc = make_shared<GPUExecutable>(func, timing_enable);
         m_exec_map.insert({func, rc});
     }
     return rc;
 }

-runtime::gpu::GPU_Executable::GPU_Executable(shared_ptr<Function> func, bool enable_timing)
-    : m_context(new GPU_Backend::BackendContext())
-{
-    FunctionInstance& instance = m_function_instance;
-    if (instance.m_compiled_function == nullptr)
-    {
-        m_context->bind_cuda_context_to_thread();
-        instance.m_compiled_function = runtime::gpu::GPUCompiledFunction::make(func, m_context);
-        instance.m_compiled_function->m_emit_timing = enable_timing;
-        instance.m_compiled_function->compile();
-        instance.m_runtime = instance.m_compiled_function->m_runtime;
-        instance.m_inputs.resize(func->get_parameters().size());
-        instance.m_outputs.resize(func->get_output_size());
-    }
-    set_parameters_and_results(*func);
-}
-
-void runtime::gpu::GPU_Executable::initialize_io(void** target,
-                                                 const vector<shared_ptr<runtime::Tensor>>& source)
-{
-    for (size_t i = 0; i < source.size(); i++)
-    {
-        shared_ptr<runtime::gpu::GPUTensor> tv =
-            dynamic_pointer_cast<runtime::gpu::GPUTensor>(source[i]);
-        if (tv)
-        {
-            target[i] = tv->m_allocated_buffer_pool;
-        }
-        else
-        {
-            throw invalid_argument("Tensors passed to GPU backend must be GPU Tensors");
-        }
-    }
-}
-
-bool runtime::gpu::GPU_Executable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
-                                        const vector<shared_ptr<runtime::Tensor>>& inputs)
-{
-    FunctionInstance& instance = m_function_instance;
-    if (instance.m_compiled_function == nullptr)
-    {
-        throw runtime_error("compile() must be called before call().");
-    }
-    // ensure the GPURuntimeContext primitive pointers are valid
-    m_context->prepare_runtime_context();
-    // Device tensors
-    initialize_io(instance.m_inputs.data(), inputs);
-    initialize_io(instance.m_outputs.data(), outputs);
-    auto ctx = m_context->m_runtime_context.get();
-    instance.m_runtime(instance.m_inputs.data(), instance.m_outputs.data(), ctx);
-    return true;
-}
-
-// void runtime::gpu::GPU_Backend::remove_compiled_function(shared_ptr<Function> func)
-// {
-//     m_function_map.erase(func);
-// }
-
-vector<runtime::PerformanceCounter> runtime::gpu::GPU_Executable::get_performance_data() const
-{
-    std::vector<runtime::PerformanceCounter> rc;
-    const FunctionInstance& instance = m_function_instance;
-    if (instance.m_compiled_function != nullptr)
-    {
-        instance.m_compiled_function->get_performance_data(rc);
-    }
-    return rc;
-}
-
-bool runtime::gpu::GPU_Backend::is_supported(const Node& op) const
+bool runtime::gpu::GPUBackend::is_supported(const Node& op) const
 {
     set<string> unsupported_ops = {"Quantize",
                                    "Dequantize",
...
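Note that the compile() path above memoizes executables per Function via m_exec_map. A short sketch of the observable behavior (hypothetical caller, reusing f from the earlier snippet):

auto backend = runtime::Backend::create("GPU");
auto exec1 = backend->compile(f); // miss: builds a GPUExecutable, caches it
auto exec2 = backend->compile(f); // hit: returns the cached Executable
// exec1 == exec2; on a cache hit the timing_enable argument has no effect,
// since the cached executable keeps the flag it was built with.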
@@ -35,15 +35,16 @@ namespace ngraph
             class GPUPrimitiveEmitter;
             struct GPURuntimeContext;
             class CudaContextManager;
+            class GPUExecutable;

             using EntryPoint_t = void(void** inputs, void** outputs, GPURuntimeContext* ctx);
             using EntryPoint = std::function<EntryPoint_t>;

             BackendConstructor GPU_BACKEND_API get_backend_constructor_pointer();

-            class GPU_Backend : public Backend
+            class GPUBackend : public Backend
             {
             public:
-                GPU_Backend();
+                GPUBackend();

                 std::shared_ptr<ngraph::runtime::Tensor>
                     create_tensor(const ngraph::element::Type& element_type,
@@ -77,40 +78,6 @@ namespace ngraph
             private:
                 std::map<std::shared_ptr<Function>, std::shared_ptr<Executable>> m_exec_map;
             };
-
-            class GPU_Executable : public Executable
-            {
-            public:
-                GPU_Executable(std::shared_ptr<Function> func, bool enable_timing);
-
-                bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
-                          const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
-
-                // void remove_compiled_function(std::shared_ptr<Function> func) override;
-
-                std::vector<PerformanceCounter> get_performance_data() const override;
-
-            private:
-                class FunctionInstance
-                {
-                public:
-                    std::shared_ptr<GPUCompiledFunction> m_compiled_function;
-                    bool m_performance_counters_enabled = false;
-                    EntryPoint m_runtime;
-                    std::vector<void*> m_inputs;
-                    std::vector<void*> m_outputs;
-                } m_function_instance;
-
-                /// \brief Convert a vector of Tensor into a vector of void* where each void*
-                ///        points to a Tensor's data buffer.
-                /// \param target Pointer to a pre-allocated array of void* with
-                ///        size >= source.size()
-                /// \param source Source vector of Tensors
-                static void
-                    initialize_io(void** target,
-                                  const std::vector<std::shared_ptr<runtime::Tensor>>& source);
-
-                std::shared_ptr<GPU_Backend::BackendContext> m_context;
-            };
         }
     }
 }
@@ -81,7 +81,7 @@ static GPUStaticInitializers s_static_initializers;
 runtime::gpu::GPUCompiledFunction::GPUCompiledFunction(
     const shared_ptr<ngraph::Function>& function,
-    const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
+    const std::shared_ptr<GPUBackend::BackendContext>& shared_context)
     : m_runtime(nullptr)
     , m_function(function)
     , m_emit_timing(false)
@@ -119,7 +119,7 @@ std::vector<std::string> get_case_variants(std::vector<std::string> cases)
 std::shared_ptr<runtime::gpu::GPUCompiledFunction> runtime::gpu::GPUCompiledFunction::make(
     const std::shared_ptr<ngraph::Function>& function,
-    const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
+    const std::shared_ptr<GPUBackend::BackendContext>& shared_context)
 {
     return std::make_shared<runtime::gpu::GPUInternalFunction>(function, shared_context);
 }
...
@@ -48,18 +48,18 @@ namespace ngraph
             class GPUCompiledFunction
             {
-                friend class GPU_Backend;
-                friend class GPU_Executable;
+                friend class GPUBackend;
+                friend class GPUExecutable;

             public:
                 GPUCompiledFunction(
                     const std::shared_ptr<ngraph::Function>& function,
-                    const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
+                    const std::shared_ptr<GPUBackend::BackendContext>& shared_context);
                 virtual ~GPUCompiledFunction();

                 static std::shared_ptr<GPUCompiledFunction>
                     make(const std::shared_ptr<ngraph::Function>& function,
-                         const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
+                         const std::shared_ptr<GPUBackend::BackendContext>& shared_context);

                 std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
                 const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
                 {
@@ -110,7 +110,7 @@ namespace ngraph
                 std::string m_function_name;
                 std::unordered_map<std::string, size_t> m_tensor_memory_buffers;
-                std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
+                std::shared_ptr<GPUBackend::BackendContext> m_shared_context;
             };
         }
 }
...
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/graph_util.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/gpu/gpu_executable.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_internal_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
using namespace std;
runtime::gpu::GPUExecutable::GPUExecutable(shared_ptr<Function> func, bool enable_timing)
    : m_context(new GPUBackend::BackendContext())
{
    if (m_compiled_function == nullptr)
    {
        m_context->bind_cuda_context_to_thread();
        m_compiled_function = runtime::gpu::GPUCompiledFunction::make(func, m_context);
        m_compiled_function->m_emit_timing = enable_timing;
        m_compiled_function->compile();
        m_runtime = m_compiled_function->m_runtime;
        m_inputs.resize(func->get_parameters().size());
        m_outputs.resize(func->get_output_size());
    }
    set_parameters_and_results(*func);
}

void runtime::gpu::GPUExecutable::initialize_io(void** target,
                                                const vector<shared_ptr<runtime::Tensor>>& source)
{
    for (size_t i = 0; i < source.size(); i++)
    {
        shared_ptr<runtime::gpu::GPUTensor> tv =
            dynamic_pointer_cast<runtime::gpu::GPUTensor>(source[i]);
        if (tv)
        {
            target[i] = tv->m_allocated_buffer_pool;
        }
        else
        {
            throw invalid_argument("Tensors passed to GPU backend must be GPU Tensors");
        }
    }
}

bool runtime::gpu::GPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
                                       const vector<shared_ptr<runtime::Tensor>>& inputs)
{
    if (m_compiled_function == nullptr)
    {
        throw runtime_error("compile() must be called before call().");
    }
    // ensure the GPURuntimeContext primitive pointers are valid
    m_context->prepare_runtime_context();
    // Device tensors
    initialize_io(m_inputs.data(), inputs);
    initialize_io(m_outputs.data(), outputs);
    auto ctx = m_context->m_runtime_context.get();
    m_runtime(m_inputs.data(), m_outputs.data(), ctx);
    return true;
}

vector<runtime::PerformanceCounter> runtime::gpu::GPUExecutable::get_performance_data() const
{
    std::vector<runtime::PerformanceCounter> rc;
    if (m_compiled_function != nullptr)
    {
        m_compiled_function->get_performance_data(rc);
    }
    return rc;
}
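The new gpu_executable.cpp keeps the old logic but drops the FunctionInstance wrapper: m_compiled_function, m_runtime, m_inputs, and m_outputs are now direct members. m_runtime is an EntryPoint, i.e. a std::function over raw device-pointer arrays plus the runtime context. A hedged sketch of a stand-in entry point (illustrative only; real entry points are produced by GPUCompiledFunction and launch the primitives that prepare_runtime_context() keeps current):

#include <cuda_runtime.h>

// Stand-in for generated code: copies input 0 to output 0 on the device.
// By this point initialize_io() has already reduced the Tensor vectors to
// raw device pointers, so plain device-to-device copies are valid.
ngraph::runtime::gpu::EntryPoint identity_runtime =
    [](void** inputs, void** outputs, ngraph::runtime::gpu::GPURuntimeContext* /* ctx */) {
        cudaMemcpy(outputs[0], inputs[0], 2 * 2 * sizeof(float),
                   cudaMemcpyDeviceToDevice);
    };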
//*****************************************************************************
// Copyright 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once

#include <map>
#include <memory>

#include "gpu_backend_visibility.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            class GPUPrimitiveEmitter;
            struct GPURuntimeContext;
            class CudaContextManager;

            using EntryPoint_t = void(void** inputs, void** outputs, GPURuntimeContext* ctx);
            using EntryPoint = std::function<EntryPoint_t>;

            class GPUExecutable : public Executable
            {
            public:
                GPUExecutable(std::shared_ptr<Function> func, bool enable_timing);

                bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
                          const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;

                // void remove_compiled_function(std::shared_ptr<Function> func) override;

                std::vector<PerformanceCounter> get_performance_data() const override;

            private:
                std::shared_ptr<GPUCompiledFunction> m_compiled_function;
                bool m_performance_counters_enabled = false;
                EntryPoint m_runtime;
                std::vector<void*> m_inputs;
                std::vector<void*> m_outputs;

                /// \brief Convert a vector of Tensor into a vector of void* where each void*
                ///        points to a Tensor's data buffer.
                /// \param target Pointer to a pre-allocated array of void* with
                ///        size >= source.size()
                /// \param source Source vector of Tensors
                static void
                    initialize_io(void** target,
                                  const std::vector<std::shared_ptr<runtime::Tensor>>& source);

                std::shared_ptr<GPUBackend::BackendContext> m_context;
            };
        }
    }
}
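Performance counters remain opt-in: get_performance_data() only returns timings when the executable was built with enable_timing (which sets m_emit_timing on the compiled function). A sketch of reading them, reusing the backend and tensors from the first snippet (accessor names are assumed from the nGraph runtime of this era; check runtime/performance_counter.hpp):

auto exec = backend->compile(f, /* timing_enable */ true);
exec->call({result}, {a, b});
for (const runtime::PerformanceCounter& pc : exec->get_performance_data())
{
    // total_microseconds()/call_count() assumed per contemporary nGraph.
    std::cout << pc.total_microseconds() << " us over "
              << pc.call_count() << " call(s)\n";
}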
@@ -153,7 +153,7 @@ std::string runtime::gpu::GPUExternalFunction::emit_op(GPUCompiledFunction* exte
 runtime::gpu::GPUExternalFunction::GPUExternalFunction(
     const shared_ptr<ngraph::Function>& function,
-    const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
+    const std::shared_ptr<GPUBackend::BackendContext>& shared_context)
     : GPUCompiledFunction(function, shared_context)
 {
 }
...
@@ -52,7 +52,7 @@ namespace ngraph
             public:
                 GPUExternalFunction(
                     const std::shared_ptr<ngraph::Function>& function,
-                    const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
+                    const std::shared_ptr<GPUBackend::BackendContext>& shared_context);
                 virtual ~GPUExternalFunction();

                 virtual std::string
...
@@ -128,7 +128,7 @@ std::string runtime::gpu::GPUInternalFunction::emit_op(GPUCompiledFunction* comp
 runtime::gpu::GPUInternalFunction::GPUInternalFunction(
     const shared_ptr<ngraph::Function>& function,
-    const std::shared_ptr<GPU_Backend::BackendContext>& shared_context)
+    const std::shared_ptr<GPUBackend::BackendContext>& shared_context)
     : GPUCompiledFunction(function, shared_context)
 {
 }
...
@@ -49,7 +49,7 @@ namespace ngraph
             public:
                 GPUInternalFunction(
                     const std::shared_ptr<ngraph::Function>& function,
-                    const std::shared_ptr<GPU_Backend::BackendContext>& shared_context);
+                    const std::shared_ptr<GPUBackend::BackendContext>& shared_context);
                 virtual ~GPUInternalFunction();

                 virtual std::string
...