Commit 55a25d41 authored by Chris Sullivan, committed by Robert Kimball

Refactored GPU backend state into BackendContext (#1186)

* Refactored GPU backend state into BackendContext and moved it up to the top-level GPU_Backend.
Some bugs appeared in the process and need investigation.

* extra *block_size

* change grid_size to threads

* Bug fix in softmax cache parameters.

* Additional bug fix for maxpool1d cache parameters.

* Bug fix in softmax cache parameters.

* Additional bug fix for maxpool1d cache parameters.

* Remove temporary print statements.

* Use nthreads in primitive hash.

* Switched from using stack references for the cudnn and cublas handles to heap pointers held only by the c-struct GPURuntimeContext but managed by the GPU_Backend (see the sketch below the commit header).

* Refactored the use of GPURuntimeContext* ctx throughout the emitters.

* Use std::prev instead of operator-- for memory iterator capture

* bug fix from abaf1d7
parent 8bde818c
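
A condensed sketch of the ownership model described in the bullets above, reusing the names that appear in the diffs below (a simplification for orientation, not the literal headers): the backend owns a BackendContext, which heap-allocates the cuBLAS/cuDNN handles and exposes them only through the plain C-struct GPURuntimeContext handed to generated code.

// Sketch only; simplified from gpu_backend.hpp / gpu_backend.cpp in this commit.
#include <cublas_v2.h>
#include <cudnn.h>
#include <memory>
#include <stdexcept>

// Plain C-struct visible to the generated kernels: it merely holds the handles.
struct GPURuntimeContext
{
    cublasHandle_t* cublas_handle = nullptr;
    cudnnHandle_t* cudnn_handle = nullptr;
};

// Owned by GPU_Backend; creates the heap-allocated handles and tears them down once.
class BackendContext
{
public:
    BackendContext()
        : m_runtime_context(new GPURuntimeContext)
    {
        m_runtime_context->cublas_handle = new cublasHandle_t;
        if (cublasCreate(m_runtime_context->cublas_handle) != CUBLAS_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuBLAS create handle failed");
        }
        m_runtime_context->cudnn_handle = new cudnnHandle_t;
        if (cudnnCreate(m_runtime_context->cudnn_handle) != CUDNN_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuDNN create handle failed");
        }
    }
    ~BackendContext()
    {
        cublasDestroy(*m_runtime_context->cublas_handle);
        delete m_runtime_context->cublas_handle;
        cudnnDestroy(*m_runtime_context->cudnn_handle);
        delete m_runtime_context->cudnn_handle;
    }
    std::unique_ptr<GPURuntimeContext> m_runtime_context;
};

Keeping GPURuntimeContext a plain struct of raw pointers lets the generated code consume it through a C-style interface, while the handle lifetime is controlled in exactly one place (the backend) instead of in each GPU_ExternalFunction.
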
This diff is collapsed.
......@@ -38,8 +38,7 @@ namespace ngraph
friend class GPUPrimitiveEmitter;
public:
size_t build_pad(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_pad(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape pad_below,
......@@ -47,22 +46,19 @@ namespace ngraph
GPUShape pad_interior,
const std::string& pad_value = "");
size_t build_pad_dynamic(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape padding_below,
GPUShape padding_interior);
size_t build_1d_max_pool(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
size_t window_width,
size_t window_stride);
size_t build_avg_pool(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_avg_pool(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape window_shape,
......@@ -70,23 +66,20 @@ namespace ngraph
GPUShape padding_below,
bool include_pad = false);
size_t build_slice(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_slice(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape lower_bounds,
GPUShape slice_strides,
GPUShape output_shape);
size_t build_reduce_window(const GPURuntimeContext* ctx,
const OpName op_name,
size_t build_reduce_window(const OpName op_name,
const std::vector<std::string>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
GPUShape reduce_window_shape,
GPUShape reduce_window_strides);
size_t build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t build_reverse_sequence(const std::array<std::string, 3>& dtypes,
GPUShape input_shape0,
GPUShape input_shape1,
GPUShape output_shape,
......@@ -94,24 +87,21 @@ namespace ngraph
size_t sequence_axis);
template <typename T>
size_t build_elementwise(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t build_elementwise(const std::vector<std::string>& dtypes,
GPUShape tensor_shape)
{
return build_elementwise_n_to_1(
ctx, dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
}
template <typename ELEMENTWISE_OP_TYPE, typename REDUCE_OP_TYPE = ngraph::op::Nop>
size_t build_elementwise_collective(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t build_elementwise_collective(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
const std::set<size_t>& reduced_tensors = {},
const std::set<size_t>& axes = {},
bool save_elementwise = false)
{
return build_fused_ew_to_collective(ctx,
dtypes,
return build_fused_ew_to_collective(dtypes,
tensor_shape,
reduced_tensors,
axes,
......@@ -121,26 +111,22 @@ namespace ngraph
save_elementwise);
}
size_t build_replace_slice(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t build_replace_slice(const std::array<std::string, 3>& dtypes,
GPUShape tensor_shape,
GPUShape source_shape,
GPUShape lower_bounds,
GPUShape upper_bounds,
GPUShape slice_stride);
size_t build_broadcast(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_broadcast(const std::array<std::string, 2>& dtypes,
GPUShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
size_t build_reshape(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order);
size_t build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
size_t build_convolution(const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
......@@ -150,19 +136,17 @@ namespace ngraph
GPUShape output_shape);
private:
CUDAEmitter(GPUPrimitiveEmitter* emitter);
CUDAEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx);
uint32_t align_to_block_size(uint32_t threads, uint32_t block_size);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
const std::string& tensor_name,
GPUShape shape);
std::string include_helpers();
size_t build_elementwise_n_to_1(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
const char* op,
const char* kernel);
size_t build_fused_ew_to_collective(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
size_t build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
const std::set<size_t>& reduced_tensors,
const std::set<size_t>& axes,
......@@ -172,6 +156,7 @@ namespace ngraph
bool save_elementwise);
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
};
}
}
......
This diff is collapsed.
......@@ -56,8 +56,7 @@ namespace ngraph
Backward
};
size_t build_convolution(const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
size_t build_convolution(const std::string& dtype,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
const Shape& output_tensor_shape,
......@@ -65,8 +64,7 @@ namespace ngraph
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_convolution_backward_data(const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
size_t build_convolution_backward_data(const std::string& dtype,
const Shape& input_filter_shape,
const Shape& input_tensor_shape,
const Shape& output_tensor_shape,
......@@ -74,8 +72,7 @@ namespace ngraph
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_convolution_backward_filter(const runtime::gpu::GPURuntimeContext* ctx,
const std::string& dtype,
size_t build_convolution_backward_filter(const std::string& dtype,
const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1,
const Shape& output_filter_shape,
......@@ -83,22 +80,19 @@ namespace ngraph
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_reduce_forward(const GPURuntimeContext* ctx,
const cudnnReduceTensorOp_t& reduce_op,
size_t build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
const std::string& dtype,
const Shape& input_shape,
const AxisSet& reduction_axes);
size_t build_tensor_op(const GPURuntimeContext* ctx,
const cudnnOpTensorOp_t& tensor_op,
size_t build_tensor_op(const cudnnOpTensorOp_t& tensor_op,
const std::string& dtype,
const Shape& input_shape,
const double alpha0,
const double alpha1,
const double beta);
size_t build_pooling(const GPURuntimeContext* ctx,
const cudnnPoolingMode_t& pool_op,
size_t build_pooling(const cudnnPoolingMode_t& pool_op,
const std::string& dtype,
const Prop& direction,
const ngraph::Shape& input_shape,
......@@ -108,23 +102,21 @@ namespace ngraph
const ngraph::Shape& padding_below,
const ngraph::Shape& padding_above);
size_t build_batchnorm(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnBatchNormMode_t& bn_op,
size_t build_batchnorm(const cudnnBatchNormMode_t& bn_op,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape,
const Shape& param_shape,
double epsilon);
size_t build_softmax(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnSoftmaxAlgorithm_t& algorithm,
size_t build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape);
private:
CUDNNEmitter(GPUPrimitiveEmitter* emitter);
CUDNNEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx);
void* get_data_by_type(cudnnDataType_t data_type, double value);
......@@ -149,6 +141,7 @@ namespace ngraph
CUDNNHostParameters m_host_parameters;
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
};
}
}
......
......@@ -14,9 +14,15 @@
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/graph_util.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
#include "ngraph/util.hpp"
......@@ -38,6 +44,57 @@ extern "C" void delete_backend(runtime::Backend* backend)
delete backend;
}
runtime::gpu::GPU_Backend::GPU_Backend()
: runtime::Backend()
, m_context(new BackendContext())
{
}
runtime::gpu::GPU_Backend::BackendContext::BackendContext()
: m_runtime_context(new GPURuntimeContext)
, m_primitive_emitter(new GPUPrimitiveEmitter(m_runtime_context))
{
// Create the context using the driver API and make it current; the runtime calls will pick up this context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
// #interoperability-between-runtime-and-driver-apis
ngraph::runtime::gpu::CudaContextManager::Instance().SetContextCurrent();
m_runtime_context->cublas_handle = new cublasHandle_t;
cublasStatus_t cublasStatus = cublasCreate(m_runtime_context->cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create handle failed");
}
// Pass scalar parameters by device pointer
cublasSetPointerMode(*m_runtime_context->cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
m_runtime_context->cudnn_handle = new cudnnHandle_t;
cudnnStatus_t cudnnStatus = cudnnCreate(m_runtime_context->cudnn_handle);
if (cudnnStatus != CUDNN_STATUS_SUCCESS)
{
throw runtime_error("cuDNN create handle failed");
}
// register with c-api runtime context
m_runtime_context->compiled_kernel_pool = new CudaFunctionPool;
}
void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
{
// add pointers to gpu primitives into the gpu runtime context
m_runtime_context->gpu_primitives = m_primitive_emitter->get_primitives().data();
m_runtime_context->gpu_memory_primitives = m_primitive_emitter->get_memory_primitives().data();
}
runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
{
cublasDestroy(*m_runtime_context->cublas_handle);
delete m_runtime_context->cublas_handle;
cudnnDestroy(*m_runtime_context->cudnn_handle);
delete m_runtime_context->cudnn_handle;
delete m_runtime_context->compiled_kernel_pool;
}
shared_ptr<runtime::gpu::GPU_CallFrame> runtime::gpu::GPU_Backend::make_call_frame(
const shared_ptr<GPU_ExternalFunction>& external_function)
{
......@@ -61,7 +118,7 @@ bool runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func)
FunctionInstance& instance = m_function_map[func];
if (instance.m_external_function == nullptr)
{
instance.m_external_function = make_shared<GPU_ExternalFunction>(func);
instance.m_external_function = make_shared<GPU_ExternalFunction>(func, m_context);
instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled;
auto cf = instance.m_external_function->make_call_frame();
instance.m_call_frame = dynamic_pointer_cast<GPU_CallFrame>(cf);
......@@ -83,7 +140,9 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func,
rc = compile(func);
}
instance.m_call_frame->call(outputs, inputs);
// ensure the GPURuntimeContext primitive pointers are valid
m_context->prepare_runtime_context();
instance.m_call_frame->call(outputs, inputs, m_context->m_runtime_context.get());
return rc;
}
......
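
The hunk above re-registers the primitive pointers immediately before every call rather than once at call-frame construction. Below is a minimal sketch of why, under the assumption (not stated explicitly in the diff) that the primitive emitter stores its primitives in std::vectors whose .data() pointers are invalidated when compiling another function grows them; the types are simplified stand-ins for gpu::primitive and gpu::memory_primitive.

// Sketch of what prepare_runtime_context() accomplishes; hypothetical simplified types.
#include <functional>
#include <vector>

using primitive = std::function<void(void** inputs, void** outputs)>;
using memory_primitive = std::function<void*()>;

struct runtime_context_sketch
{
    primitive* gpu_primitives = nullptr;
    memory_primitive* gpu_memory_primitives = nullptr;
};

void prepare_runtime_context_sketch(runtime_context_sketch& rctx,
                                    std::vector<primitive>& primitives,
                                    std::vector<memory_primitive>& memory_primitives)
{
    // Re-capture the raw pointers so they are valid for this call even if the
    // vectors were reallocated since the last capture.
    rctx.gpu_primitives = primitives.data();
    rctx.gpu_memory_primitives = memory_primitives.data();
}
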
......@@ -31,10 +31,13 @@ namespace ngraph
class GPU_ExternalFunction;
class GPU_CallFrame;
class GPUPrimitiveEmitter;
struct GPURuntimeContext;
class GPU_Backend : public Backend
{
public:
GPU_Backend();
std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame(
const std::shared_ptr<ngraph::runtime::gpu::GPU_ExternalFunction>&
external_function);
......@@ -59,6 +62,17 @@ namespace ngraph
std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override;
class BackendContext
{
public:
BackendContext();
~BackendContext();
void prepare_runtime_context();
std::unique_ptr<GPURuntimeContext> m_runtime_context;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
};
private:
class FunctionInstance
{
......@@ -69,6 +83,7 @@ namespace ngraph
};
std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map;
std::shared_ptr<BackendContext> m_context;
};
}
}
......
......@@ -32,17 +32,16 @@ runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction>
: m_external_function(external_function)
, m_compiled_function(compiled_function)
{
setup_runtime_context();
}
runtime::gpu::GPU_CallFrame::~GPU_CallFrame()
{
cleanup_runtime_context();
}
void runtime::gpu::GPU_CallFrame::call(
const std::vector<std::shared_ptr<runtime::TensorView>>& output_tvs,
const std::vector<std::shared_ptr<runtime::TensorView>>& input_tvs)
const std::vector<std::shared_ptr<runtime::TensorView>>& input_tvs,
GPURuntimeContext* ctx)
{
//Device tensors
vector<void*> inputs;
......@@ -61,18 +60,5 @@ void runtime::gpu::GPU_CallFrame::call(
outputs.push_back(tv->m_allocated_buffer_pool);
}
m_compiled_function(inputs.data(), outputs.data(), m_external_function->m_ctx.get());
}
void runtime::gpu::GPU_CallFrame::setup_runtime_context()
{
// add pointers to gpu primitives into the gpu runtime context
const auto& primitive_emitter = m_external_function->get_primitive_emitter();
m_external_function->m_ctx->gpu_primitives = primitive_emitter->get_primitives().data();
m_external_function->m_ctx->gpu_memory_primitives =
primitive_emitter->get_memory_primitives().data();
}
void runtime::gpu::GPU_CallFrame::cleanup_runtime_context()
{
m_compiled_function(inputs.data(), outputs.data(), ctx);
}
......@@ -53,10 +53,8 @@ namespace ngraph
///
/// Tuples will be expanded into their tensor views to build the call frame.
void call(const std::vector<std::shared_ptr<runtime::TensorView>>& outputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& inputs);
void setup_runtime_context();
void cleanup_runtime_context();
const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
GPURuntimeContext* ctx);
protected:
std::shared_ptr<GPU_ExternalFunction> m_external_function;
......
This diff is collapsed.
......@@ -85,8 +85,8 @@ namespace ngraph
dtypes.push_back(arg.get_type());
}
dtypes.push_back(out[0].get_type());
auto ew_index = cuda_emitter->build_elementwise<T>(
external_function->ctx().get(), dtypes, out[0].get_shape());
auto ew_index =
cuda_emitter->build_elementwise<T>(dtypes, out[0].get_shape());
writer << "gpu::invoke_primitive(ctx, " << ew_index << ", ";
writer << "std::vector<void*>{" << args.front().get_name();
for (size_t i = 1; i < args.size(); i++)
......
......@@ -237,44 +237,21 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Or), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Or>}};
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function)
const shared_ptr<ngraph::Function>& function,
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function)
: m_compiled_function(nullptr)
, m_ctx(new GPURuntimeContext)
, m_function(function)
, m_emit_timing(false)
, m_is_compiled(false)
, m_release_function(release_function)
, m_temporaries_used(false)
, m_shared_context(shared_context)
{
// Create context use driver API and make it current, the runtime call will pickup the context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
// #interoperability-between-runtime-and-driver-apis
ngraph::runtime::gpu::CudaContextManager::Instance().SetContextCurrent();
cublasStatus_t cublasStatus = cublasCreate(&m_cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create handle failed");
}
cudnnStatus_t cudnnStatus = cudnnCreate(&m_cudnn_handle);
if (cudnnStatus != CUDNN_STATUS_SUCCESS)
{
throw runtime_error("cuDNN create handle failed");
}
// Pass scalars as reference on the Device
cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
// register with c-api runtime context
m_ctx->cublas_handle = &m_cublas_handle;
m_ctx->cudnn_handle = &m_cudnn_handle;
m_ctx->compiled_kernel_pool = new CudaFunctionPool;
}
runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
{
cublasDestroy(m_cublas_handle);
cudnnDestroy(m_cudnn_handle);
delete m_ctx->compiled_kernel_pool;
}
void runtime::gpu::GPU_ExternalFunction::emit_header()
......@@ -387,7 +364,8 @@ void runtime::gpu::GPU_ExternalFunction::emit_constant_declarations()
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
GPUAllocator allocator =
m_shared_context->m_primitive_emitter->get_memory_allocator();
size_t idx = allocator.reserve_argspace(
c->get_data_ptr(),
tv->get_tensor().size() * tv->get_tensor().get_element_type().size());
......@@ -698,8 +676,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
return;
}
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
m_function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
......@@ -722,8 +698,9 @@ void runtime::gpu::GPU_ExternalFunction::compile()
emit_function_declarations();
collect_unique_functions();
emit_functions();
// allocate device buffers for primitive arguments and workspace
m_primitive_emitter->allocate_primitive_memory();
m_shared_context->m_primitive_emitter->allocate_primitive_memory();
string code = m_writer.get_code();
store_emitted_functions(code);
......@@ -781,11 +758,6 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(Node* node)
}
}
unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
{
return m_ctx;
}
string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
const string& function_name)
{
......
......@@ -32,6 +32,7 @@
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
......@@ -62,6 +63,7 @@ namespace ngraph
public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function = true);
~GPU_ExternalFunction();
......@@ -69,7 +71,7 @@ namespace ngraph
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
{
return m_primitive_emitter;
return m_shared_context->m_primitive_emitter;
}
protected:
......@@ -98,8 +100,6 @@ namespace ngraph
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unique_ptr<GPURuntimeContext> m_ctx;
std::shared_ptr<ngraph::Function> m_function;
std::map<std::string, size_t> m_name_index_map;
......@@ -116,8 +116,7 @@ namespace ngraph
std::string m_function_name;
std::string m_pch_header_source;
cublasHandle_t m_cublas_handle;
cudnnHandle_t m_cudnn_handle;
std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
};
}
}
......
......@@ -28,17 +28,36 @@ runtime::gpu::GPUMemoryManager::GPUMemoryManager(GPUPrimitiveEmitter* emitter)
: m_buffer_offset(0)
, m_buffered_mem(initial_buffer_size)
, m_workspace_manager(alignment)
, m_argspace(nullptr)
, m_workspace(nullptr)
, m_allocation_size(0)
, m_argspace_mem(1, {nullptr, 0})
, m_workspace_mem(1, {nullptr, 0})
, m_primitive_emitter(emitter)
{
}
size_t runtime::gpu::GPUMemoryManager::get_allocation_size() const
{
size_t allocation_size = 0;
for (auto const& alloc : m_argspace_mem)
{
allocation_size += alloc.size;
}
for (auto const& alloc : m_workspace_mem)
{
allocation_size += alloc.size;
}
return allocation_size;
}
runtime::gpu::GPUMemoryManager::~GPUMemoryManager()
{
runtime::gpu::free_gpu_buffer(m_argspace);
runtime::gpu::free_gpu_buffer(m_workspace);
for (auto& alloc : m_argspace_mem)
{
runtime::gpu::free_gpu_buffer(alloc.ptr);
}
for (auto& alloc : m_workspace_mem)
{
runtime::gpu::free_gpu_buffer(alloc.ptr);
}
}
void runtime::gpu::GPUMemoryManager::allocate()
......@@ -46,15 +65,25 @@ void runtime::gpu::GPUMemoryManager::allocate()
if (m_buffer_offset)
{
m_buffer_offset = pass::MemoryManager::align(m_buffer_offset, alignment);
m_argspace = runtime::gpu::create_gpu_buffer(m_buffer_offset);
runtime::gpu::cuda_memcpyHtD(m_argspace, m_buffered_mem.data(), m_buffer_offset);
m_allocation_size += m_buffer_offset;
// the back most node is always empty, fill it here
m_argspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(m_buffer_offset);
m_argspace_mem.back().size = m_buffer_offset;
// copy buffered kernel arguments to device
runtime::gpu::cuda_memcpyHtD(
m_argspace_mem.back().ptr, m_buffered_mem.data(), m_buffer_offset);
// add an empty node to the end of the list and zero offset
m_argspace_mem.push_back({nullptr, 0});
m_buffer_offset = 0;
}
auto workspace_size = m_workspace_manager.max_allocated();
if (workspace_size)
{
m_workspace = runtime::gpu::create_gpu_buffer(workspace_size);
m_allocation_size += workspace_size;
m_workspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(workspace_size);
m_workspace_mem.back().size = workspace_size;
m_workspace_mem.push_back({nullptr, 0});
// construct a new manager if the current one was used
m_workspace_manager = pass::MemoryManager(alignment);
}
}
......@@ -86,17 +115,16 @@ size_t runtime::gpu::GPUAllocator::reserve_argspace(const void* data, size_t siz
// add parameter data to a host buffer that will be transferred to the device
size = pass::MemoryManager::align(size, runtime::gpu::GPUMemoryManager::alignment);
size_t offset = m_manager->queue_for_transfer(data, size);
// required to capture m_manager pointer
// directly rather than `this` pointer
auto manager = m_manager;
auto local = std::prev(m_manager->m_argspace_mem.end());
// return a lambda that will yield the gpu memory address. this
// should only be evaluated by the runtime invoked primitive
gpu::memory_primitive mem_primitive = [=]() {
if (manager->m_argspace == nullptr)
void* argspace = (*local).ptr;
if (argspace == nullptr)
{
throw std::runtime_error("An attempt was made to use unallocated device memory.");
}
auto gpu_mem = static_cast<uint8_t*>(manager->m_argspace);
auto gpu_mem = static_cast<uint8_t*>(argspace);
return static_cast<void*>(gpu_mem + offset);
};
return m_manager->m_primitive_emitter->insert(mem_primitive);
......@@ -106,23 +134,22 @@ size_t runtime::gpu::GPUAllocator::reserve_workspace(size_t size, bool zero_init
{
size_t offset = m_manager->m_workspace_manager.allocate(size);
m_active.push(offset);
// required to capture m_manager pointer
// directly rather than `this` pointer
auto manager = m_manager;
auto local = std::prev(m_manager->m_workspace_mem.end());
// return a lambda that will yield the gpu memory address. this
// should only be evaluated by the runtime invoked primitive
gpu::memory_primitive mem_primitive = [=]() {
if (manager->m_workspace == nullptr)
void* workspace = (*local).ptr;
if (workspace == nullptr)
{
throw std::runtime_error("An attempt was made to use unallocated device memory.");
}
auto gpu_mem = static_cast<uint8_t*>(manager->m_workspace);
auto workspace = static_cast<void*>(gpu_mem + offset);
auto gpu_mem = static_cast<uint8_t*>(workspace);
auto workspace_ptr = static_cast<void*>(gpu_mem + offset);
if (zero_initialize)
{
runtime::gpu::cuda_memset(workspace, 0, size);
runtime::gpu::cuda_memset(workspace_ptr, 0, size);
}
return workspace;
return workspace_ptr;
};
return m_manager->m_primitive_emitter->insert(mem_primitive);
}
......
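
The reservations above hand back lambdas that resolve the device address lazily, capturing an iterator to the list node that allocate() will later fill in. A self-contained sketch of that pattern, including the std::prev capture mentioned in the commit message (names simplified; allocation mirrors the struct added in gpu_memory_manager.hpp):

// Sketch only; not the nGraph implementation.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iterator>
#include <list>
#include <stdexcept>

struct allocation
{
    void* ptr;
    std::size_t size;
};

using memory_primitive = std::function<void*()>;

memory_primitive make_argspace_primitive(std::list<allocation>& argspace_mem, std::size_t offset)
{
    // Capture an iterator to the current (still empty) back node; std::list
    // iterators stay valid as later nodes are appended by allocate().
    auto local = std::prev(argspace_mem.end());
    return [local, offset]() -> void* {
        void* argspace = local->ptr;
        if (argspace == nullptr)
        {
            throw std::runtime_error("An attempt was made to use unallocated device memory.");
        }
        return static_cast<uint8_t*>(argspace) + offset;
    };
}

Capturing the iterator via std::prev (instead of decrementing a temporary end() iterator, or capturing the manager/this pointer as the old code did) keeps each lambda tied to the specific allocation node it was reserved against, which is what the new per-allocation lists require.
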
......@@ -16,6 +16,7 @@
#pragma once
#include <list>
#include <memory>
#include <stack>
#include <vector>
......@@ -65,7 +66,7 @@ namespace ngraph
~GPUMemoryManager();
void allocate();
size_t get_allocation_size() { return m_allocation_size; }
size_t get_allocation_size() const;
GPUAllocator build_allocator() { return GPUAllocator(this); }
private:
GPUMemoryManager(GPUPrimitiveEmitter* emitter);
......@@ -75,10 +76,15 @@ namespace ngraph
std::vector<uint8_t> m_buffered_mem;
pass::MemoryManager m_workspace_manager;
static constexpr const uint16_t alignment = 8;
void* m_argspace;
void* m_workspace;
size_t m_allocation_size;
struct allocation
{
void* ptr;
size_t size;
};
std::list<allocation> m_argspace_mem;
std::list<allocation> m_workspace_mem;
GPUPrimitiveEmitter* m_primitive_emitter;
};
}
......
......@@ -23,8 +23,15 @@ using namespace ngraph;
using namespace ngraph::runtime::gpu;
GPUPrimitiveEmitter::GPUPrimitiveEmitter()
: m_cuda_emitter(new CUDAEmitter(this))
, m_cudnn_emitter(new CUDNNEmitter(this))
: m_cuda_emitter(new CUDAEmitter(this, nullptr))
, m_cudnn_emitter(new CUDNNEmitter(this, nullptr))
, m_memory_manager(this)
{
}
GPUPrimitiveEmitter::GPUPrimitiveEmitter(const std::unique_ptr<GPURuntimeContext>& ctx)
: m_cuda_emitter(new CUDAEmitter(this, ctx.get()))
, m_cudnn_emitter(new CUDNNEmitter(this, ctx.get()))
, m_memory_manager(this)
{
}
......
......@@ -31,11 +31,11 @@ namespace ngraph
{
class CUDAEmitter;
class CUDNNEmitter;
class GPUPrimitiveEmitter
{
public:
GPUPrimitiveEmitter();
GPUPrimitiveEmitter(const std::unique_ptr<GPURuntimeContext>& ctx);
std::unique_ptr<CUDAEmitter>& get_cuda_emitter();
std::unique_ptr<CUDNNEmitter>& get_cudnn_emitter();
std::vector<gpu::primitive*>& get_primitives() { return m_gpu_primitives; }
......