Commit 55a25d41 authored by Chris Sullivan, committed by Robert Kimball

Refactored GPU backend state into BackendContext (#1186)

* Refactored GPU backend state into BackendContext and moved it to the highest-level GPU_Backend.
Some bugs have appeared in the process and need investigation.

* extra *block_size

* change grid_size to threads

* Bug fix in softmax cache parameters.

* Additional bug fix for maxpool1d cache parameters.

* Bug fix in softmax cache parameters.

* Additional bug fix for maxpool1d cache parameters.

* Remove temporary print statements.

* Use nthreads in primitive hash.

* Switched from using stack references for the cuDNN and cuBLAS handles to heap pointers, held only by the C-struct GPURuntimeContext but managed by the GPU_Backend (see the sketch after the commit message below).

* Refactored the use of GPURuntimeContext* ctx throughout the emitters.

* Use std::prev instead of operator-- for memory iterator capture

* bug fix from abaf1d7
parent 8bde818c
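
The shape of the refactor, summarized from the diff below: the emitters no longer take a GPURuntimeContext* on every build_* call; each emitter captures the context once at construction and reads it through an m_ctx member, while the cuDNN/cuBLAS handles become heap-allocated pointers stored in the C-struct GPURuntimeContext and owned at the top of the backend. What follows is only an illustrative sketch of that ownership pattern, not the ngraph sources: the BackendContext layout, GPU_Backend member names, and the create/destroy ordering are assumptions, and error handling is elided; GPURuntimeContext, m_ctx, cudnn_handle/cublas_handle, and the CUDAEmitter constructor signature mirror names that actually appear in the diff.

    // Sketch only, under the assumptions stated above.
    #include <cublas_v2.h>
    #include <cudnn.h>
    #include <memory>

    struct GPURuntimeContext // plain C-struct handed to runtime primitives
    {
        cudnnHandle_t* cudnn_handle;   // heap pointers; lifetime managed by the backend
        cublasHandle_t* cublas_handle;
        // other fields (e.g. compiled_kernel_pool) omitted from this sketch
    };

    class BackendContext // hypothetical layout: owns all per-backend GPU state
    {
    public:
        BackendContext()
        {
            m_runtime_context.cudnn_handle = new cudnnHandle_t;
            m_runtime_context.cublas_handle = new cublasHandle_t;
            cudnnCreate(m_runtime_context.cudnn_handle);
            cublasCreate(m_runtime_context.cublas_handle);
        }
        ~BackendContext()
        {
            cublasDestroy(*m_runtime_context.cublas_handle);
            cudnnDestroy(*m_runtime_context.cudnn_handle);
            delete m_runtime_context.cublas_handle;
            delete m_runtime_context.cudnn_handle;
        }
        GPURuntimeContext m_runtime_context;
    };

    class GPU_Backend
    {
    public:
        GPU_Backend() : m_context(new BackendContext()) {}
    private:
        std::shared_ptr<BackendContext> m_context; // highest-level owner of backend state
    };

    class GPUPrimitiveEmitter; // defined elsewhere in the backend

    class CUDAEmitter
    {
    public:
        CUDAEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx)
            : m_primitive_emitter(emitter)
        {
            m_ctx = ctx; // captured once; build_* methods read m_ctx instead of a ctx parameter
        }
    private:
        GPUPrimitiveEmitter* m_primitive_emitter;
        GPURuntimeContext* m_ctx;
    };

On the std::prev bullet: std::prev(it) returns the predecessor of an iterator as a copy, whereas operator-- mutates the iterator in place, which matters when the iterator is being captured into a lambda; that is the likely motivation for the change, though the diff hunk itself is not shown here.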
@@ -62,13 +62,14 @@ std::ostream& operator<<(std::ostream& os, pooling_op_shape& shape)
 << shape.PAD_D << "_" << shape.PAD_H << "_" << shape.PAD_W;
 }
-runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitter)
+runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitter,
+runtime::gpu::GPURuntimeContext* ctx)
 : m_primitive_emitter(emitter)
 {
+m_ctx = ctx;
 }
-size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 GPUShape padding_below,
@@ -103,7 +104,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContex
 uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
 // if the kernel has not been compiled, build it
-auto compiled_kernel = ctx->compiled_kernel_pool->get(hash);
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(hash);
 if (compiled_kernel == nullptr)
 {
 // normalize pad dimensions to shape dimensions
@@ -186,7 +187,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContex
 }
 writer.block_end();
-compiled_kernel = ctx->compiled_kernel_pool->set(hash, writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(hash, writer.get_code());
 }
 std::unique_ptr<gpu::primitive> pad;
@@ -233,8 +234,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const runtime::gpu::GPURuntimeContex
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 GPUShape padding_below,
@@ -261,13 +261,13 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
 // a launch primitive for it based on the input tensor shape
 // but do not recompile the kernel. otherwise, do it all:
 // recompile the kernel and then create the primitive
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 CudaKernelBuilder::add_pod_typedefs(writer);
 CudaKernelBuilder::get_pad_dynamic_op(writer, kernel_name.str(), dtypes);
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
 }
 uint32_t rank = static_cast<uint32_t>(input_shape.size());
@@ -300,11 +300,12 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
 // create the launch primitive
 std::unique_ptr<gpu::primitive> pad_dynamic(new gpu::primitive{[=](void** inputs,
 void** outputs) mutable {
-void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
-void* param_output_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
-void* param_padding_below = runtime::gpu::invoke_memory_primitive(ctx, idx_padding_below);
+void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
+void* param_output_strides =
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_strides);
+void* param_padding_below = runtime::gpu::invoke_memory_primitive(m_ctx, idx_padding_below);
 void* param_padding_interior =
-runtime::gpu::invoke_memory_primitive(ctx, idx_padding_interior);
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_padding_interior);
 std::vector<void*> args_list{&inputs[0],
 &outputs[0],
 &param_input_strides,
@@ -332,9 +333,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
 m_primitive_emitter->cache(hash, primitive_index);
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_reshape(const runtime::gpu::GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape input_order)
 {
@@ -359,13 +358,13 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const runtime::gpu::GPURuntimeCo
 // a launch primitive for it based on the input tensor shape
 // but do not recompile the kernel. otherwise, do it all:
 // recompile the kernel and then create the primitive
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 CudaKernelBuilder::add_pod_typedefs(writer);
 CudaKernelBuilder::get_reshape_op(writer, kernel_name.str(), dtypes, rank);
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
 }
 uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
@@ -396,8 +395,8 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const runtime::gpu::GPURuntimeCo
 // create the launch primitive
 std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
 void** outputs) mutable {
-void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
-void* param_trans_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_trans_strides);
+void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
+void* param_trans_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_trans_strides);
 std::vector<void*> args_list{
 &inputs[0], &outputs[0], &param_input_strides, &param_trans_strides, &nthreads};
@@ -420,8 +419,7 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const runtime::gpu::GPURuntimeCo
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_slice(const runtime::gpu::GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_slice(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape lower_bounds,
 GPUShape slice_strides,
@@ -448,13 +446,13 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const runtime::gpu::GPURuntimeCont
 // a launch primitive for it based on the input tensor shape
 // but do not recompile the kernel. otherwise, do it all:
 // recompile the kernel and then create the primitive
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 CudaKernelBuilder::add_pod_typedefs(writer);
 CudaKernelBuilder::get_slice_op(writer, kernel_name.str(), dtypes, output_shape.size());
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
 }
 uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
@@ -478,10 +476,11 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const runtime::gpu::GPURuntimeCont
 // create the launch primitive
 std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
 void** outputs) mutable {
-void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
-void* param_output_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
-void* param_lower_bounds = runtime::gpu::invoke_memory_primitive(ctx, idx_lower_bounds);
-void* param_slice_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_slice_strides);
+void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
+void* param_output_strides =
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_strides);
+void* param_lower_bounds = runtime::gpu::invoke_memory_primitive(m_ctx, idx_lower_bounds);
+void* param_slice_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_slice_strides);
 std::vector<void*> args_list{&inputs[0],
 &outputs[0],
 &param_input_strides,
@@ -509,8 +508,7 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const runtime::gpu::GPURuntimeCont
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
-const std::array<std::string, 3>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const std::array<std::string, 3>& dtypes,
 GPUShape input_shape0,
 GPUShape input_shape1,
 GPUShape output_shape,
@@ -538,14 +536,14 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPU
 // a launch primitive for it based on the input tensor shape
 // but do not recompile the kernel. otherwise, do it all:
 // recompile the kernel and then create the primitive
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 CudaKernelBuilder::add_pod_typedefs(writer);
 CudaKernelBuilder::get_reverse_sequence_op(
 writer, kernel_name.str(), dtypes, batch_axis, sequence_axis, output_shape.size());
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
 }
 uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
@@ -564,8 +562,9 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPU
 // create the launch primitive
 std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
 void** outputs) mutable {
-void* param_output_shape = runtime::gpu::invoke_memory_primitive(ctx, idx_output_shape);
-void* param_output_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
+void* param_output_shape = runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_shape);
+void* param_output_strides =
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_strides);
 std::vector<void*> args_list{&inputs[0],
 &inputs[1],
 &outputs[0],
@@ -592,8 +591,7 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPU
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 size_t window_width,
@@ -617,13 +615,13 @@ size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx
 }
 // if the kernel has not been compiled, build it
-auto compiled_kernel = ctx->compiled_kernel_pool->get(hash);
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(hash);
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 CudaKernelBuilder::get_max_pool_1d(
 writer, kernel_name, dtypes, input_width, output_width, window_width, window_stride);
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
 }
 //TODO: currently we set it to 64, will add tuning method later
@@ -722,8 +720,7 @@ pooling_op_shape
 return shape;
 }
-size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_avg_pool(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 GPUShape window_shape,
@@ -750,13 +747,13 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
 // if the kernel has not been compiled, build it
 kernel_name += "_ip" + std::to_string(int(include_pad));
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name);
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 writer << include_helpers();
 CudaKernelBuilder::get_avg_pool(writer, kernel_name, dtypes, include_pad);
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
 }
 // precompute for fast constant memory access
@@ -843,8 +840,7 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const GPURuntimeContext* ctx,
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeContext* ctx,
-const std::vector<std::string>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
 GPUShape tensor_shape,
 const char* op,
 const char* kernel)
@@ -870,7 +866,7 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeConte
 // a launch primitive for it based on the input tensor shape
 // but do not recompile the kernel. otherwise, do it all:
 // recompile the kernel and then create the primitive
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
@@ -882,7 +878,7 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeConte
 CudaKernelBuilder::get_elementwise_op(writer, kernel_name.str(), op, dtypes);
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
 }
 size_t nthreads = shape_size(tensor_shape);
 //TODO: currently we set it to 64, will add tuning method later
@@ -920,8 +916,7 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const GPURuntimeConte
 }
 size_t
-runtime::gpu::CUDAEmitter::build_fused_ew_to_collective(const GPURuntimeContext* ctx,
-const std::vector<std::string>& dtypes,
+runtime::gpu::CUDAEmitter::build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
 GPUShape tensor_shape,
 const std::set<size_t>& reduced_tensors,
 const std::set<size_t>& axes,
@@ -954,7 +949,7 @@ size_t
 // a launch primitive for it based on the input tensor shape
 // but do not recompile the kernel. otherwise, do it all:
 // recompile the kernel and then create the primitive
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
@@ -972,7 +967,7 @@ size_t
 reduced_tensors,
 save_elementwise,
 tensor_shape.size());
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
 }
 // calculate strides
@@ -1016,10 +1011,10 @@ size_t
 // TODO: check if mutable is necessary
 std::unique_ptr<gpu::primitive> ew_collective(new gpu::primitive{[=](void** inputs,
 void** outputs) mutable {
-void* strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_strides);
-void* stride_magic_d = runtime::gpu::invoke_memory_primitive(ctx, idx_stride_magic);
-void* stride_shift_d = runtime::gpu::invoke_memory_primitive(ctx, idx_stride_shift);
-void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_reduced_strides);
+void* strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_strides);
+void* stride_magic_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_magic);
+void* stride_shift_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_shift);
+void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduced_strides);
 std::vector<void*> args_list;
 for (auto i = 0u; i < dtypes.size() - 1; i++)
@@ -1056,8 +1051,7 @@ size_t
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* ctx,
-const OpName op_name,
+size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
 const std::vector<std::string>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
@@ -1106,7 +1100,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
 // a launch primitive for it based on the input tensor shape
 // but do not recompile the kernel. otherwise, do it all:
 // recompile the kernel and then create the primitive
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
@@ -1116,7 +1110,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
 CudaKernelBuilder::get_device_helper(writer, op, kernel, dtypes);
 }
 CudaKernelBuilder::get_reduce_window_op(writer, kernel_name.str(), op, dtypes, rank);
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
 }
 size_t nthreads = shape_size(output_shape);
@@ -1136,12 +1130,12 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
 // create the launch primitive
 std::unique_ptr<gpu::primitive> f(new gpu::primitive{[=](void** inputs,
 void** outputs) mutable {
-void* param_input_strides = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
-void* param_output_shape = runtime::gpu::invoke_memory_primitive(ctx, idx_output_shape);
+void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
+void* param_output_shape = runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_shape);
 void* param_reduce_window_shape =
-runtime::gpu::invoke_memory_primitive(ctx, idx_reduce_window_shape);
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduce_window_shape);
 void* param_reduce_window_strides =
-runtime::gpu::invoke_memory_primitive(ctx, idx_reduce_window_strides);
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduce_window_strides);
 std::vector<void*> args_list(7, NULL);
 args_list[0] = &inputs[0];
@@ -1172,8 +1166,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const GPURuntimeContext* c
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_replace_slice(const GPURuntimeContext* ctx,
-const std::array<std::string, 3>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::string, 3>& dtypes,
 GPUShape tensor_shape,
 GPUShape source_shape,
 GPUShape lower_bounds,
@@ -1200,14 +1193,14 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const GPURuntimeContext* c
 constexpr const int nthreads_per_block = 32;
 // if the kernel has not been compiled, build it
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name);
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 writer << include_helpers();
 runtime::gpu::CudaKernelBuilder::get_replace_slice_op(
 writer, kernel_name, dtypes, nthreads_per_block);
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
 }
 // calculate strides
@@ -1261,58 +1254,57 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const GPURuntimeContext* c
 float alpha = 1.0f;
 float beta = 0.0f;
-std::unique_ptr<gpu::primitive> replace_slice(
-new gpu::primitive{[=](void** inputs, void** outputs) mutable {
-void* param_dstr = runtime::gpu::invoke_memory_primitive(ctx, idx_input_strides);
-void* param_dmagic = runtime::gpu::invoke_memory_primitive(ctx, idx_dmagics);
-void* param_dshift = runtime::gpu::invoke_memory_primitive(ctx, idx_dshifts);
-void* param_lbound = runtime::gpu::invoke_memory_primitive(ctx, idx_lower_bounds);
-void* param_ubound = runtime::gpu::invoke_memory_primitive(ctx, idx_upper_bounds);
-void* param_slice_str = runtime::gpu::invoke_memory_primitive(ctx, idx_slice_strides);
-void* param_slice_magic = runtime::gpu::invoke_memory_primitive(ctx, idx_smagics);
-void* param_slice_shift = runtime::gpu::invoke_memory_primitive(ctx, idx_sshifts);
-void* param_dsource = runtime::gpu::invoke_memory_primitive(ctx, idx_source_shape);
-void* param_sourcestr = runtime::gpu::invoke_memory_primitive(ctx, idx_source_strides);
+std::unique_ptr<gpu::primitive> replace_slice(new gpu::primitive{[=](void** inputs,
+void** outputs) mutable {
+void* param_dstr = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
+void* param_dmagic = runtime::gpu::invoke_memory_primitive(m_ctx, idx_dmagics);
+void* param_dshift = runtime::gpu::invoke_memory_primitive(m_ctx, idx_dshifts);
+void* param_lbound = runtime::gpu::invoke_memory_primitive(m_ctx, idx_lower_bounds);
+void* param_ubound = runtime::gpu::invoke_memory_primitive(m_ctx, idx_upper_bounds);
+void* param_slice_str = runtime::gpu::invoke_memory_primitive(m_ctx, idx_slice_strides);
+void* param_slice_magic = runtime::gpu::invoke_memory_primitive(m_ctx, idx_smagics);
+void* param_slice_shift = runtime::gpu::invoke_memory_primitive(m_ctx, idx_sshifts);
+void* param_dsource = runtime::gpu::invoke_memory_primitive(m_ctx, idx_source_shape);
+void* param_sourcestr = runtime::gpu::invoke_memory_primitive(m_ctx, idx_source_strides);
 void* args_list[] = {&inputs[0],
 &inputs[1],
 &outputs[0],
 &alpha,
 &beta,
 &param_dstr,
 &param_dmagic,
 &param_dshift,
 &param_lbound,
 &param_ubound,
 &param_slice_str,
 &param_slice_magic,
 &param_slice_shift,
 &param_dsource,
 &param_sourcestr,
 &rank,
 &nthreads};
 CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
 nblocks,
 1,
 1,
 nthreads_per_block,
 1,
 1,
 rank * nthreads_per_block * sizeof(int),
 NULL,
 args_list,
 0));
 CUDA_SAFE_CALL(cuCtxSynchronize());
 }});
 primitive_index = this->m_primitive_emitter->insert(std::move(replace_slice));
 m_primitive_emitter->cache(hash, primitive_index);
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_broadcast(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string, 2>& dtypes,
 GPUShape result_shape,
 const std::set<size_t>& reduce_axes)
 {
@@ -1333,14 +1325,14 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const GPURuntimeContext* ctx,
 }
 // if the kernel has not been compiled, build it
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name);
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 writer << include_helpers();
 runtime::gpu::CudaKernelBuilder::get_broadcast_op(
 writer, kernel_name, dtypes, result_shape.size());
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
 }
 // calculate strides
@@ -1389,10 +1381,10 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const GPURuntimeContext* ctx,
 std::unique_ptr<gpu::primitive> broadcast(new gpu::primitive{[=](void** inputs,
 void** outputs) mutable {
-void* strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_strides);
-void* stride_magic_d = runtime::gpu::invoke_memory_primitive(ctx, idx_stride_magic);
-void* stride_shift_d = runtime::gpu::invoke_memory_primitive(ctx, idx_stride_shift);
-void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_reduced_strides);
+void* strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_strides);
+void* stride_magic_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_magic);
+void* stride_shift_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_stride_shift);
+void* reduced_strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_reduced_strides);
 void* args_list[] = {&inputs[0],
 &outputs[0],
@@ -1423,8 +1415,7 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const GPURuntimeContext* ctx,
 return primitive_index;
 }
-size_t runtime::gpu::CUDAEmitter::build_convolution(const GPURuntimeContext* ctx,
-const std::array<std::string, 3>& dtypes,
+size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string, 3>& dtypes,
 GPUShape input_shape,
 GPUShape input_pad_below,
 GPUShape input_dilation,
@@ -1489,14 +1480,14 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const GPURuntimeContext* ctx
 }
 // if the kernel has not been compiled, build it
-auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name);
+auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
 if (compiled_kernel == nullptr)
 {
 codegen::CodeWriter writer;
 writer << include_helpers();
 CudaKernelBuilder::get_convolution_forward(
 writer, kernel_name, dtypes, N, K, filter_size, rank, sm_tile_size, reg_tile_size);
-compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
 }
 // ----- build primitive arguments -----
@@ -1609,24 +1600,28 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const GPURuntimeContext* ctx
 std::unique_ptr<gpu::primitive> conv(new gpu::primitive{[=](void** inputs,
 void** outputs) mutable {
-void* pad_d = runtime::gpu::invoke_memory_primitive(ctx, idx_pad);
-void* data_dilation_d = runtime::gpu::invoke_memory_primitive(ctx, idx_data_dilation);
+void* pad_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_pad);
+void* data_dilation_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation);
 void* data_dilation_magic_d =
-runtime::gpu::invoke_memory_primitive(ctx, idx_data_dilation_magic);
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation_magic);
 void* data_dilation_shift_d =
-runtime::gpu::invoke_memory_primitive(ctx, idx_data_dilation_shift);
-void* filter_strides_d = runtime::gpu::invoke_memory_primitive(ctx, idx_filter_strides);
-void* filter_dilation_d = runtime::gpu::invoke_memory_primitive(ctx, idx_filter_dilation);
-void* input_shape_d = runtime::gpu::invoke_memory_primitive(ctx, idx_input_shape);
-void* input_shape_str_d = runtime::gpu::invoke_memory_primitive(ctx, idx_input_shape_str);
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_data_dilation_shift);
+void* filter_strides_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_strides);
+void* filter_dilation_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_dilation);
+void* input_shape_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_shape);
+void* input_shape_str_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_shape_str);
 void* output_dim_strides_d =
-runtime::gpu::invoke_memory_primitive(ctx, idx_output_dim_strides);
-void* output_str_magic_d = runtime::gpu::invoke_memory_primitive(ctx, idx_output_str_magic);
-void* output_str_shift_d = runtime::gpu::invoke_memory_primitive(ctx, idx_output_str_shift);
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_dim_strides);
+void* output_str_magic_d =
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_str_magic);
+void* output_str_shift_d =
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_output_str_shift);
 void* filter_dim_strides_d =
-runtime::gpu::invoke_memory_primitive(ctx, idx_filter_dim_strides);
-void* filter_str_magic_d = runtime::gpu::invoke_memory_primitive(ctx, idx_filter_str_magic);
-void* filter_str_shift_d = runtime::gpu::invoke_memory_primitive(ctx, idx_filter_str_shift);
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_dim_strides);
+void* filter_str_magic_d =
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_str_magic);
+void* filter_str_shift_d =
+runtime::gpu::invoke_memory_primitive(m_ctx, idx_filter_str_shift);
 void* args_list[] = {&inputs[0],
 &inputs[1],
...
@@ -38,8 +38,7 @@ namespace ngraph
 friend class GPUPrimitiveEmitter;
 public:
-size_t build_pad(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t build_pad(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 GPUShape pad_below,
@@ -47,22 +46,19 @@ namespace ngraph
 GPUShape pad_interior,
 const std::string& pad_value = "");
-size_t build_pad_dynamic(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 GPUShape padding_below,
 GPUShape padding_interior);
-size_t build_1d_max_pool(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 size_t window_width,
 size_t window_stride);
-size_t build_avg_pool(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t build_avg_pool(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 GPUShape window_shape,
@@ -70,23 +66,20 @@ namespace ngraph
 GPUShape padding_below,
 bool include_pad = false);
-size_t build_slice(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t build_slice(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape lower_bounds,
 GPUShape slice_strides,
 GPUShape output_shape);
-size_t build_reduce_window(const GPURuntimeContext* ctx,
-const OpName op_name,
+size_t build_reduce_window(const OpName op_name,
 const std::vector<std::string>& dtypes,
 GPUShape input_shape,
 GPUShape output_shape,
 GPUShape reduce_window_shape,
 GPUShape reduce_window_strides);
-size_t build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
-const std::array<std::string, 3>& dtypes,
+size_t build_reverse_sequence(const std::array<std::string, 3>& dtypes,
 GPUShape input_shape0,
 GPUShape input_shape1,
 GPUShape output_shape,
@@ -94,24 +87,21 @@ namespace ngraph
 size_t sequence_axis);
 template <typename T>
-size_t build_elementwise(const GPURuntimeContext* ctx,
-const std::vector<std::string>& dtypes,
+size_t build_elementwise(const std::vector<std::string>& dtypes,
 GPUShape tensor_shape)
 {
 return build_elementwise_n_to_1(
-ctx, dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
+dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
 }
 template <typename ELEMENTWISE_OP_TYPE, typename REDUCE_OP_TYPE = ngraph::op::Nop>
-size_t build_elementwise_collective(const GPURuntimeContext* ctx,
-const std::vector<std::string>& dtypes,
+size_t build_elementwise_collective(const std::vector<std::string>& dtypes,
 GPUShape tensor_shape,
 const std::set<size_t>& reduced_tensors = {},
 const std::set<size_t>& axes = {},
 bool save_elementwise = false)
 {
-return build_fused_ew_to_collective(ctx,
-dtypes,
+return build_fused_ew_to_collective(dtypes,
 tensor_shape,
 reduced_tensors,
 axes,
@@ -121,26 +111,22 @@ namespace ngraph
 save_elementwise);
 }
-size_t build_replace_slice(const GPURuntimeContext* ctx,
-const std::array<std::string, 3>& dtypes,
+size_t build_replace_slice(const std::array<std::string, 3>& dtypes,
 GPUShape tensor_shape,
 GPUShape source_shape,
 GPUShape lower_bounds,
 GPUShape upper_bounds,
 GPUShape slice_stride);
-size_t build_broadcast(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t build_broadcast(const std::array<std::string, 2>& dtypes,
 GPUShape result_shape,
 const std::set<size_t>& bcast_axes);
-size_t build_reshape(const GPURuntimeContext* ctx,
-const std::array<std::string, 2>& dtypes,
+size_t build_reshape(const std::array<std::string, 2>& dtypes,
 GPUShape input_shape,
 GPUShape input_order);
-size_t build_convolution(const GPURuntimeContext* ctx,
-const std::array<std::string, 3>& dtypes,
+size_t build_convolution(const std::array<std::string, 3>& dtypes,
 GPUShape input_shape,
 GPUShape input_pad_below,
 GPUShape input_dilation,
@@ -150,19 +136,17 @@ namespace ngraph
 GPUShape output_shape);
 private:
-CUDAEmitter(GPUPrimitiveEmitter* emitter);
+CUDAEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx);
 uint32_t align_to_block_size(uint32_t threads, uint32_t block_size);
 void print_tensor_from_gpu(codegen::CodeWriter& writer,
 const std::string& tensor_name,
 GPUShape shape);
 std::string include_helpers();
-size_t build_elementwise_n_to_1(const GPURuntimeContext* ctx,
-const std::vector<std::string>& dtypes,
+size_t build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
 GPUShape tensor_shape,
 const char* op,
 const char* kernel);
-size_t build_fused_ew_to_collective(const GPURuntimeContext* ctx,
-const std::vector<std::string>& dtypes,
+size_t build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
 GPUShape tensor_shape,
 const std::set<size_t>& reduced_tensors,
 const std::set<size_t>& axes,
@@ -172,6 +156,7 @@ namespace ngraph
 bool save_elementwise);
 GPUPrimitiveEmitter* m_primitive_emitter;
+GPURuntimeContext* m_ctx;
 };
 }
 }
...
@@ -107,9 +107,10 @@ std::vector<int>
 return low_vec;
 }
-runtime::gpu::CUDNNEmitter::CUDNNEmitter(GPUPrimitiveEmitter* emitter)
+runtime::gpu::CUDNNEmitter::CUDNNEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx)
 : m_primitive_emitter(emitter)
 {
+m_ctx = ctx;
 }
 cudnnDataType_t runtime::gpu::CUDNNEmitter::get_cudnn_datatype(std::string dtype)
@@ -128,8 +129,7 @@ cudnnDataType_t runtime::gpu::CUDNNEmitter::get_cudnn_datatype(std::string dtype
 return p->second;
 }
-size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPURuntimeContext* ctx,
-const cudnnReduceTensorOp_t& reduce_op,
+size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
 const std::string& dtype,
 const Shape& input_shape,
 const AxisSet& reduction_axes)
@@ -162,7 +162,7 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
 GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
 size_t workspace_size = 0;
 CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
-*ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
+*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
 size_t workspace_idx = allocator.reserve_workspace(workspace_size);
 void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
 void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
@@ -177,8 +177,8 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
 CUDNN_REDUCE_TENSOR_NO_INDICES,
 CUDNN_32BIT_INDICES));
-void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
-CUDNN_SAFE_CALL(cudnnReduceTensor(*ctx->cudnn_handle,
+void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
+CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
 desc,
 nullptr,
 0,
@@ -197,8 +197,7 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
 return primitive_index;
 }
-size_t runtime::gpu::CUDNNEmitter::build_tensor_op(const GPURuntimeContext* ctx,
-const cudnnOpTensorOp_t& tensor_op,
+size_t runtime::gpu::CUDNNEmitter::build_tensor_op(const cudnnOpTensorOp_t& tensor_op,
 const std::string& dtype,
 const Shape& input_shape,
 const double alpha0,
@@ -231,7 +230,7 @@ size_t runtime::gpu::CUDNNEmitter::build_tensor_op(const GPURuntimeContext* ctx,
 CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(
 opTensorDesc, tensor_op, data_type, CUDNN_NOT_PROPAGATE_NAN));
-CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,
+CUDNN_SAFE_CALL(cudnnOpTensor(*m_ctx->cudnn_handle,
 opTensorDesc,
 alpha_dt0,
 descriptor,
@@ -326,8 +325,7 @@ cudnnConvolutionDescriptor_t& runtime::gpu::CUDNNEmitter::get_cudnn_convolution_
 return conv_descriptor;
 }
-size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURuntimeContext* ctx,
-const std::string& dtype,
+size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
 const Shape& input_tensor_shape,
 const Shape& input_filter_shape,
 const Shape& output_tensor_shape,
@@ -366,7 +364,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURunt
 void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
 size_t workspace_size_in_bytes = 0;
-CUDNN_SAFE_CALL(cudnnGetConvolutionForwardWorkspaceSize(*ctx->cudnn_handle,
+CUDNN_SAFE_CALL(cudnnGetConvolutionForwardWorkspaceSize(*m_ctx->cudnn_handle,
 tensor_desc_0,
 filter_desc,
 conv_desc,
@@ -382,8 +380,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURunt
 std::unique_ptr<gpu::primitive> conv;
 conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
 void** inputs, void** outputs) {
-void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
-CUDNN_SAFE_CALL(cudnnConvolutionForward(*ctx->cudnn_handle,
+void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
+CUDNN_SAFE_CALL(cudnnConvolutionForward(*m_ctx->cudnn_handle,
 alpha,
 tensor_desc_0,
 inputs[0],
@@ -404,7 +402,6 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURunt
 }
 size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
-const runtime::gpu::GPURuntimeContext* ctx,
 const std::string& dtype,
 const Shape& input_filter_shape,
 const Shape& input_tensor_shape,
@@ -443,7 +440,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
 void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
 size_t workspace_size_in_bytes = 0;
-CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(*ctx->cudnn_handle,
+CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(*m_ctx->cudnn_handle,
 filter_desc,
 tensor_desc_0,
 conv_desc,
@@ -459,8 +456,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
 std::unique_ptr<gpu::primitive> conv;
 conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
 void** inputs, void** outputs) {
-void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
-CUDNN_SAFE_CALL(cudnnConvolutionBackwardData(*ctx->cudnn_handle,
+void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
+CUDNN_SAFE_CALL(cudnnConvolutionBackwardData(*m_ctx->cudnn_handle,
 alpha,
 filter_desc,
 inputs[0],
@@ -481,7 +478,6 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
 }
 size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
-const runtime::gpu::GPURuntimeContext* ctx,
 const std::string& dtype,
 const Shape& input_tensor_shape_0,
 const Shape& input_tensor_shape_1,
@@ -520,7 +516,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
 CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
 size_t workspace_size_in_bytes = 0;
-CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*ctx->cudnn_handle,
+CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*m_ctx->cudnn_handle,
 tensor_desc_0,
 tensor_desc_1,
 conv_desc,
@@ -538,8 +534,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
 std::unique_ptr<gpu::primitive> conv;
 conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
 void** inputs, void** outputs) {
-void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
-CUDNN_SAFE_CALL(cudnnConvolutionBackwardFilter(*ctx->cudnn_handle,
+void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
+CUDNN_SAFE_CALL(cudnnConvolutionBackwardFilter(*m_ctx->cudnn_handle,
 alpha,
 tensor_desc_0,
 inputs[0],
@@ -558,8 +554,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
 return primitive_index;
 }
-size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeContext* ctx,
-const cudnnPoolingMode_t& pool_op,
+size_t runtime::gpu::CUDNNEmitter::build_pooling(const cudnnPoolingMode_t& pool_op,
 const std::string& dtype,
 const Prop& direction,
 const Shape& input_shape,
@@ -638,7 +633,7 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
 {
 pool.reset(new gpu::primitive{
 [=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
-CUDNN_SAFE_CALL(cudnnPoolingForward(*ctx->cudnn_handle,
+CUDNN_SAFE_CALL(cudnnPoolingForward(*m_ctx->cudnn_handle,
 desc,
 alpha,
 input_desc,
@@ -660,7 +655,7 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
 // cuDNN requires the output tensor of the maxpool fprop to be passed even though
 // it is not mathematically necessary. It appears, however, that it is not actually
 // used as the adjoints are passed in place and the correct result is achieved.
-CUDNN_SAFE_CALL(cudnnPoolingBackward(*ctx->cudnn_handle,
+CUDNN_SAFE_CALL(cudnnPoolingBackward(*m_ctx->cudnn_handle,
 desc,
 alpha,
 // output (wrt maxpool) tensor
@@ -686,8 +681,7 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
return primitive_index; return primitive_index;
} }
size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntimeContext* ctx, size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const cudnnBatchNormMode_t& bn_op,
const cudnnBatchNormMode_t& bn_op,
const std::string& dtype, const std::string& dtype,
const Prop& direction, const Prop& direction,
const Shape& tensor_shape, const Shape& tensor_shape,
...@@ -728,7 +722,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim ...@@ -728,7 +722,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
{ {
batchnorm.reset(new gpu::primitive{ batchnorm.reset(new gpu::primitive{
[=, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) { [=, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardInference(*ctx->cudnn_handle, CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardInference(*m_ctx->cudnn_handle,
bn_op, bn_op,
alpha, alpha,
beta, beta,
...@@ -762,7 +756,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim ...@@ -762,7 +756,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
void* bias_factor = m_host_parameters.allocate_by_datatype(data_type, (m - 1) / m); void* bias_factor = m_host_parameters.allocate_by_datatype(data_type, (m - 1) / m);
batchnorm.reset(new gpu::primitive{ batchnorm.reset(new gpu::primitive{
[=, &op_desc, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) { [=, &op_desc, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardTraining(*ctx->cudnn_handle, CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardTraining(*m_ctx->cudnn_handle,
bn_op, bn_op,
alpha, alpha,
beta, beta,
...@@ -781,7 +775,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim ...@@ -781,7 +775,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
NULL)); NULL));
// convert to biased variance // convert to biased variance
CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle, CUDNN_SAFE_CALL(cudnnOpTensor(*m_ctx->cudnn_handle,
op_desc, op_desc,
beta, beta,
derived_param_desc, derived_param_desc,
...@@ -800,7 +794,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim ...@@ -800,7 +794,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
batchnorm.reset(new gpu::primitive{ batchnorm.reset(new gpu::primitive{
[=, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) { [=, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationBackward( CUDNN_SAFE_CALL(cudnnBatchNormalizationBackward(
*ctx->cudnn_handle, *m_ctx->cudnn_handle,
bn_op, bn_op,
alpha, alpha,
beta, beta,
...@@ -829,8 +823,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim ...@@ -829,8 +823,7 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
return primitive_index; return primitive_index;
} }
size_t runtime::gpu::CUDNNEmitter::build_softmax(const runtime::gpu::GPURuntimeContext* ctx, size_t runtime::gpu::CUDNNEmitter::build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode, const cudnnSoftmaxMode_t& mode,
const std::string& dtype, const std::string& dtype,
const Prop& direction, const Prop& direction,
...@@ -862,7 +855,7 @@ size_t runtime::gpu::CUDNNEmitter::build_softmax(const runtime::gpu::GPURuntimeC ...@@ -862,7 +855,7 @@ size_t runtime::gpu::CUDNNEmitter::build_softmax(const runtime::gpu::GPURuntimeC
case Prop::Inference: case Prop::Inference:
{ {
softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) { softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSoftmaxForward(*ctx->cudnn_handle, CUDNN_SAFE_CALL(cudnnSoftmaxForward(*m_ctx->cudnn_handle,
algorithm, algorithm,
mode, mode,
alpha, alpha,
...@@ -877,7 +870,7 @@ size_t runtime::gpu::CUDNNEmitter::build_softmax(const runtime::gpu::GPURuntimeC ...@@ -877,7 +870,7 @@ size_t runtime::gpu::CUDNNEmitter::build_softmax(const runtime::gpu::GPURuntimeC
case Prop::Backward: case Prop::Backward:
{ {
softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) { softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSoftmaxBackward(*ctx->cudnn_handle, CUDNN_SAFE_CALL(cudnnSoftmaxBackward(*m_ctx->cudnn_handle,
algorithm, algorithm,
mode, mode,
alpha, alpha,
......
...@@ -56,8 +56,7 @@ namespace ngraph ...@@ -56,8 +56,7 @@ namespace ngraph
Backward Backward
}; };
size_t build_convolution(const runtime::gpu::GPURuntimeContext* ctx, size_t build_convolution(const std::string& dtype,
const std::string& dtype,
const Shape& input_tensor_shape, const Shape& input_tensor_shape,
const Shape& input_filter_shape, const Shape& input_filter_shape,
const Shape& output_tensor_shape, const Shape& output_tensor_shape,
...@@ -65,8 +64,7 @@ namespace ngraph ...@@ -65,8 +64,7 @@ namespace ngraph
const Strides& window_dilation_strides, const Strides& window_dilation_strides,
const Shape& padding_below); const Shape& padding_below);
size_t build_convolution_backward_data(const runtime::gpu::GPURuntimeContext* ctx, size_t build_convolution_backward_data(const std::string& dtype,
const std::string& dtype,
const Shape& input_filter_shape, const Shape& input_filter_shape,
const Shape& input_tensor_shape, const Shape& input_tensor_shape,
const Shape& output_tensor_shape, const Shape& output_tensor_shape,
...@@ -74,8 +72,7 @@ namespace ngraph ...@@ -74,8 +72,7 @@ namespace ngraph
const Strides& window_dilation_strides, const Strides& window_dilation_strides,
const Shape& padding_below); const Shape& padding_below);
size_t build_convolution_backward_filter(const runtime::gpu::GPURuntimeContext* ctx, size_t build_convolution_backward_filter(const std::string& dtype,
const std::string& dtype,
const Shape& input_tensor_shape_0, const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1, const Shape& input_tensor_shape_1,
const Shape& output_filter_shape, const Shape& output_filter_shape,
...@@ -83,22 +80,19 @@ namespace ngraph ...@@ -83,22 +80,19 @@ namespace ngraph
const Strides& window_dilation_strides, const Strides& window_dilation_strides,
const Shape& padding_below); const Shape& padding_below);
size_t build_reduce_forward(const GPURuntimeContext* ctx, size_t build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
const cudnnReduceTensorOp_t& reduce_op,
const std::string& dtype, const std::string& dtype,
const Shape& input_shape, const Shape& input_shape,
const AxisSet& reduction_axes); const AxisSet& reduction_axes);
size_t build_tensor_op(const GPURuntimeContext* ctx, size_t build_tensor_op(const cudnnOpTensorOp_t& tensor_op,
const cudnnOpTensorOp_t& tensor_op,
const std::string& dtype, const std::string& dtype,
const Shape& input_shape, const Shape& input_shape,
const double alpha0, const double alpha0,
const double alpha1, const double alpha1,
const double beta); const double beta);
size_t build_pooling(const GPURuntimeContext* ctx, size_t build_pooling(const cudnnPoolingMode_t& pool_op,
const cudnnPoolingMode_t& pool_op,
const std::string& dtype, const std::string& dtype,
const Prop& direction, const Prop& direction,
const ngraph::Shape& input_shape, const ngraph::Shape& input_shape,
...@@ -108,23 +102,21 @@ namespace ngraph ...@@ -108,23 +102,21 @@ namespace ngraph
const ngraph::Shape& padding_below, const ngraph::Shape& padding_below,
const ngraph::Shape& padding_above); const ngraph::Shape& padding_above);
size_t build_batchnorm(const runtime::gpu::GPURuntimeContext* ctx, size_t build_batchnorm(const cudnnBatchNormMode_t& bn_op,
const cudnnBatchNormMode_t& bn_op,
const std::string& dtype, const std::string& dtype,
const Prop& direction, const Prop& direction,
const Shape& tensor_shape, const Shape& tensor_shape,
const Shape& param_shape, const Shape& param_shape,
double epsilon); double epsilon);
size_t build_softmax(const runtime::gpu::GPURuntimeContext* ctx, size_t build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode, const cudnnSoftmaxMode_t& mode,
const std::string& dtype, const std::string& dtype,
const Prop& direction, const Prop& direction,
const Shape& tensor_shape); const Shape& tensor_shape);
private: private:
CUDNNEmitter(GPUPrimitiveEmitter* emitter); CUDNNEmitter(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx);
void* get_data_by_type(cudnnDataType_t data_type, double value); void* get_data_by_type(cudnnDataType_t data_type, double value);
...@@ -149,6 +141,7 @@ namespace ngraph ...@@ -149,6 +141,7 @@ namespace ngraph
CUDNNHostParameters m_host_parameters; CUDNNHostParameters m_host_parameters;
GPUPrimitiveEmitter* m_primitive_emitter; GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
}; };
} }
} }
......
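The hunks above all apply one mechanical change: every CUDNNEmitter builder loses its const runtime::gpu::GPURuntimeContext* ctx parameter and instead reads the context from the new m_ctx member, which the (now two-argument) constructor receives alongside the primitive emitter. A minimal sketch of the resulting shape, with simplified names and access (EmitterSketch and build_something are made-up illustrations, not the real class):

#include <cstddef>

// Forward declarations only, as in the real headers; the sketch never
// dereferences them, it only stores the pointers the constructor receives.
class GPUPrimitiveEmitter;
struct GPURuntimeContext;

class EmitterSketch
{
public:
    EmitterSketch(GPUPrimitiveEmitter* emitter, GPURuntimeContext* ctx)
        : m_primitive_emitter(emitter)
        , m_ctx(ctx)
    {
    }

    size_t build_something()
    {
        // each build_* method now reaches for m_ctx (cudnn_handle,
        // compiled_kernel_pool, ...) instead of a per-call ctx parameter
        return 0;
    }

private:
    GPUPrimitiveEmitter* m_primitive_emitter;
    GPURuntimeContext* m_ctx;
};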
...@@ -14,9 +14,15 @@ ...@@ -14,9 +14,15 @@
* limitations under the License. * limitations under the License.
*******************************************************************************/ *******************************************************************************/
#include "ngraph/runtime/gpu/gpu_backend.hpp" #include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include "ngraph/graph_util.hpp" #include "ngraph/graph_util.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp" #include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view.hpp" #include "ngraph/runtime/gpu/gpu_tensor_view.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
...@@ -38,6 +44,57 @@ extern "C" void delete_backend(runtime::Backend* backend) ...@@ -38,6 +44,57 @@ extern "C" void delete_backend(runtime::Backend* backend)
delete backend; delete backend;
} }
runtime::gpu::GPU_Backend::GPU_Backend()
: runtime::Backend()
, m_context(new BackendContext())
{
}
runtime::gpu::GPU_Backend::BackendContext::BackendContext()
: m_runtime_context(new GPURuntimeContext)
, m_primitive_emitter(new GPUPrimitiveEmitter(m_runtime_context))
{
// Create the context using the driver API and make it current; runtime calls will pick up this context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
// #interoperability-between-runtime-and-driver-apis
ngraph::runtime::gpu::CudaContextManager::Instance().SetContextCurrent();
m_runtime_context->cublas_handle = new cublasHandle_t;
cublasStatus_t cublasStatus = cublasCreate(m_runtime_context->cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create handle failed");
}
// Pass scalars by reference on the device
cublasSetPointerMode(*m_runtime_context->cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
m_runtime_context->cudnn_handle = new cudnnHandle_t;
cudnnStatus_t cudnnStatus = cudnnCreate(m_runtime_context->cudnn_handle);
if (cudnnStatus != CUDNN_STATUS_SUCCESS)
{
throw runtime_error("cuDNN create handle failed");
}
// register with c-api runtime context
m_runtime_context->compiled_kernel_pool = new CudaFunctionPool;
}
void runtime::gpu::GPU_Backend::BackendContext::prepare_runtime_context()
{
// add pointers to gpu primitives into the gpu runtime context
m_runtime_context->gpu_primitives = m_primitive_emitter->get_primitives().data();
m_runtime_context->gpu_memory_primitives = m_primitive_emitter->get_memory_primitives().data();
}
runtime::gpu::GPU_Backend::BackendContext::~BackendContext()
{
cublasDestroy(*m_runtime_context->cublas_handle);
delete m_runtime_context->cublas_handle;
cudnnDestroy(*m_runtime_context->cudnn_handle);
delete m_runtime_context->cudnn_handle;
delete m_runtime_context->compiled_kernel_pool;
}
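The constructor and destructor above pair every create with a destroy and keep the cuBLAS/cuDNN handles on the heap, storing only pointers to them inside GPURuntimeContext. A standalone sketch of just that handle lifetime, assuming only the CUDA toolkit headers are available (HandleSketch is an invented name; error handling mirrors the code above but is not exhaustive):

#include <cublas_v2.h>
#include <cudnn.h>
#include <stdexcept>

// Heap-allocated handles so a plain struct can hold raw pointers to them;
// created once with the context and destroyed when the context goes away.
struct HandleSketch
{
    cublasHandle_t* cublas = new cublasHandle_t;
    cudnnHandle_t* cudnn = new cudnnHandle_t;

    HandleSketch()
    {
        if (cublasCreate(cublas) != CUBLAS_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuBLAS create handle failed");
        }
        // device pointer mode: scalar arguments (alpha/beta) are read from device memory
        cublasSetPointerMode(*cublas, CUBLAS_POINTER_MODE_DEVICE);
        if (cudnnCreate(cudnn) != CUDNN_STATUS_SUCCESS)
        {
            throw std::runtime_error("cuDNN create handle failed");
        }
    }

    ~HandleSketch()
    {
        cublasDestroy(*cublas);
        delete cublas;
        cudnnDestroy(*cudnn);
        delete cudnn;
    }
};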
shared_ptr<runtime::gpu::GPU_CallFrame> runtime::gpu::GPU_Backend::make_call_frame( shared_ptr<runtime::gpu::GPU_CallFrame> runtime::gpu::GPU_Backend::make_call_frame(
const shared_ptr<GPU_ExternalFunction>& external_function) const shared_ptr<GPU_ExternalFunction>& external_function)
{ {
...@@ -61,7 +118,7 @@ bool runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func) ...@@ -61,7 +118,7 @@ bool runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func)
FunctionInstance& instance = m_function_map[func]; FunctionInstance& instance = m_function_map[func];
if (instance.m_external_function == nullptr) if (instance.m_external_function == nullptr)
{ {
instance.m_external_function = make_shared<GPU_ExternalFunction>(func); instance.m_external_function = make_shared<GPU_ExternalFunction>(func, m_context);
instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled; instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled;
auto cf = instance.m_external_function->make_call_frame(); auto cf = instance.m_external_function->make_call_frame();
instance.m_call_frame = dynamic_pointer_cast<GPU_CallFrame>(cf); instance.m_call_frame = dynamic_pointer_cast<GPU_CallFrame>(cf);
...@@ -83,7 +140,9 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func, ...@@ -83,7 +140,9 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func,
rc = compile(func); rc = compile(func);
} }
instance.m_call_frame->call(outputs, inputs); // ensure the GPURuntimeContext primitive pointers are valid
m_context->prepare_runtime_context();
instance.m_call_frame->call(outputs, inputs, m_context->m_runtime_context.get());
return rc; return rc;
} }
......
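prepare_runtime_context() runs before every call because the GPURuntimeContext stores raw pointers obtained from std::vector::data(); compiling another function can grow those vectors and reallocate their storage, so the pointers must be refreshed just before the call frame is invoked. A minimal sketch of that invariant, with made-up names (RuntimeContextSketch, primitive_table) standing in for the real members:

#include <vector>

struct RuntimeContextSketch
{
    int* primitives = nullptr; // stands in for gpu_primitives
};

int main()
{
    std::vector<int> primitive_table{1, 2, 3};
    RuntimeContextSketch ctx;

    // compiling more functions may push_back into primitive_table and
    // reallocate its storage, so refresh the raw pointer just before use
    ctx.primitives = primitive_table.data();
    return (ctx.primitives[0] == 1) ? 0 : 1;
}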
...@@ -31,10 +31,13 @@ namespace ngraph ...@@ -31,10 +31,13 @@ namespace ngraph
class GPU_ExternalFunction; class GPU_ExternalFunction;
class GPU_CallFrame; class GPU_CallFrame;
class GPUPrimitiveEmitter;
struct GPURuntimeContext;
class GPU_Backend : public Backend class GPU_Backend : public Backend
{ {
public: public:
GPU_Backend();
std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame( std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame(
const std::shared_ptr<ngraph::runtime::gpu::GPU_ExternalFunction>& const std::shared_ptr<ngraph::runtime::gpu::GPU_ExternalFunction>&
external_function); external_function);
...@@ -59,6 +62,17 @@ namespace ngraph ...@@ -59,6 +62,17 @@ namespace ngraph
std::vector<PerformanceCounter> std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override; get_performance_data(std::shared_ptr<Function> func) const override;
class BackendContext
{
public:
BackendContext();
~BackendContext();
void prepare_runtime_context();
std::unique_ptr<GPURuntimeContext> m_runtime_context;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
};
private: private:
class FunctionInstance class FunctionInstance
{ {
...@@ -69,6 +83,7 @@ namespace ngraph ...@@ -69,6 +83,7 @@ namespace ngraph
}; };
std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map; std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map;
std::shared_ptr<BackendContext> m_context;
}; };
} }
} }
......
...@@ -32,17 +32,16 @@ runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> ...@@ -32,17 +32,16 @@ runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction>
: m_external_function(external_function) : m_external_function(external_function)
, m_compiled_function(compiled_function) , m_compiled_function(compiled_function)
{ {
setup_runtime_context();
} }
runtime::gpu::GPU_CallFrame::~GPU_CallFrame() runtime::gpu::GPU_CallFrame::~GPU_CallFrame()
{ {
cleanup_runtime_context();
} }
void runtime::gpu::GPU_CallFrame::call( void runtime::gpu::GPU_CallFrame::call(
const std::vector<std::shared_ptr<runtime::TensorView>>& output_tvs, const std::vector<std::shared_ptr<runtime::TensorView>>& output_tvs,
const std::vector<std::shared_ptr<runtime::TensorView>>& input_tvs) const std::vector<std::shared_ptr<runtime::TensorView>>& input_tvs,
GPURuntimeContext* ctx)
{ {
//Device tensors //Device tensors
vector<void*> inputs; vector<void*> inputs;
...@@ -61,18 +60,5 @@ void runtime::gpu::GPU_CallFrame::call( ...@@ -61,18 +60,5 @@ void runtime::gpu::GPU_CallFrame::call(
outputs.push_back(tv->m_allocated_buffer_pool); outputs.push_back(tv->m_allocated_buffer_pool);
} }
m_compiled_function(inputs.data(), outputs.data(), m_external_function->m_ctx.get()); m_compiled_function(inputs.data(), outputs.data(), ctx);
}
void runtime::gpu::GPU_CallFrame::setup_runtime_context()
{
// add pointers to gpu primitives into the gpu runtime context
const auto& primitive_emitter = m_external_function->get_primitive_emitter();
m_external_function->m_ctx->gpu_primitives = primitive_emitter->get_primitives().data();
m_external_function->m_ctx->gpu_memory_primitives =
primitive_emitter->get_memory_primitives().data();
}
void runtime::gpu::GPU_CallFrame::cleanup_runtime_context()
{
} }
...@@ -53,10 +53,8 @@ namespace ngraph ...@@ -53,10 +53,8 @@ namespace ngraph
/// ///
/// Tuples will be expanded into their tensor views to build the call frame. /// Tuples will be expanded into their tensor views to build the call frame.
void call(const std::vector<std::shared_ptr<runtime::TensorView>>& outputs, void call(const std::vector<std::shared_ptr<runtime::TensorView>>& outputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& inputs); const std::vector<std::shared_ptr<runtime::TensorView>>& inputs,
GPURuntimeContext* ctx);
void setup_runtime_context();
void cleanup_runtime_context();
protected: protected:
std::shared_ptr<GPU_ExternalFunction> m_external_function; std::shared_ptr<GPU_ExternalFunction> m_external_function;
......
...@@ -123,13 +123,8 @@ namespace ngraph ...@@ -123,13 +123,8 @@ namespace ngraph
{ {
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(), auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_ADD, CUDNN_OP_TENSOR_ADD, out[0].get_type(), args[0].get_shape(), 1.0, 1.0, 0);
out[0].get_type(),
args[0].get_shape(),
1.0,
1.0,
0);
writer << "gpu::invoke_primitive(ctx, " << index << ", "; writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "," writer << "std::vector<void*>{" << args[0].get_name() << ","
...@@ -187,22 +182,16 @@ namespace ngraph ...@@ -187,22 +182,16 @@ namespace ngraph
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
size_t reshape_data_index = size_t reshape_data_index = cuda_emitter->build_reshape(
cuda_emitter->build_reshape(external_function->ctx().get(), {{args[0].get_type(), args[0].get_type()}}, input_shape, input_order);
{{args[0].get_type(), args[0].get_type()}},
input_shape,
input_order);
writer << "void* data = gpu::invoke_memory_primitive(ctx, " writer << "void* data = gpu::invoke_memory_primitive(ctx, "
<< transposed_data_idx << ");\n"; << transposed_data_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_data_index << ", "; writer << "gpu::invoke_primitive(ctx, " << reshape_data_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), "; writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{data}.data());\n"; writer << "std::vector<void*>{data}.data());\n";
size_t reshape_filter_index = size_t reshape_filter_index = cuda_emitter->build_reshape(
cuda_emitter->build_reshape(external_function->ctx().get(), {{args[1].get_type(), args[1].get_type()}}, filter_shape, input_order);
{{args[1].get_type(), args[1].get_type()}},
filter_shape,
input_order);
writer << "void* filter = gpu::invoke_memory_primitive(ctx, " writer << "void* filter = gpu::invoke_memory_primitive(ctx, "
<< transposed_filter_idx << ");\n"; << transposed_filter_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_filter_index << ", "; writer << "gpu::invoke_primitive(ctx, " << reshape_filter_index << ", ";
...@@ -227,7 +216,6 @@ namespace ngraph ...@@ -227,7 +216,6 @@ namespace ngraph
output_shape = reshape(output_shape, input_order); output_shape = reshape(output_shape, input_order);
size_t conv_index = cuda_emitter->build_convolution( size_t conv_index = cuda_emitter->build_convolution(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}}, {{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape, input_shape,
padding_below_diff, padding_below_diff,
...@@ -250,11 +238,8 @@ namespace ngraph ...@@ -250,11 +238,8 @@ namespace ngraph
input_order.push_back(i); input_order.push_back(i);
} }
size_t reshape_output_index = size_t reshape_output_index = cuda_emitter->build_reshape(
cuda_emitter->build_reshape(external_function->ctx().get(), {{args[1].get_type(), args[1].get_type()}}, output_shape, input_order);
{{args[1].get_type(), args[1].get_type()}},
output_shape,
input_order);
writer << "gpu::invoke_primitive(ctx, " << reshape_output_index << ", "; writer << "gpu::invoke_primitive(ctx, " << reshape_output_index << ", ";
writer << "std::vector<void*>{output}.data(), "; writer << "std::vector<void*>{output}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n"; writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
...@@ -304,7 +289,6 @@ namespace ngraph ...@@ -304,7 +289,6 @@ namespace ngraph
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index = cuda_emitter->build_pad_dynamic( auto pad_dynamic_index = cuda_emitter->build_pad_dynamic(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}}, {{args[0].get_type(), out[0].get_type()}},
input_shape, input_shape,
input_shape_padded, input_shape_padded,
...@@ -322,8 +306,7 @@ namespace ngraph ...@@ -322,8 +306,7 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index = cudnn_emitter->build_convolution(external_function->ctx().get(), size_t index = cudnn_emitter->build_convolution(out[0].get_type(),
out[0].get_type(),
input_shape_padded, input_shape_padded,
args[1].get_shape(), args[1].get_shape(),
out[0].get_shape(), out[0].get_shape(),
...@@ -424,8 +407,7 @@ namespace ngraph ...@@ -424,8 +407,7 @@ namespace ngraph
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index = auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic(external_function->ctx().get(), cuda_emitter->build_pad_dynamic({{args[0].get_type(), out[0].get_type()}},
{{args[0].get_type(), out[0].get_type()}},
output_shape, output_shape,
output_shape_padded, output_shape_padded,
padding_below, padding_below,
...@@ -442,8 +424,7 @@ namespace ngraph ...@@ -442,8 +424,7 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index = size_t index =
cudnn_emitter->build_convolution_backward_data(external_function->ctx().get(), cudnn_emitter->build_convolution_backward_data(out[0].get_type(),
out[0].get_type(),
args[0].get_shape(), args[0].get_shape(),
args[1].get_shape(), args[1].get_shape(),
output_shape_padded, output_shape_padded,
...@@ -469,8 +450,7 @@ namespace ngraph ...@@ -469,8 +450,7 @@ namespace ngraph
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto slice_index = auto slice_index =
cuda_emitter->build_slice(external_function->ctx().get(), cuda_emitter->build_slice({{args[0].get_type(), out[0].get_type()}},
{{args[0].get_type(), out[0].get_type()}},
output_shape_padded, output_shape_padded,
padding_below_back, padding_below_back,
padding_interior_back, padding_interior_back,
...@@ -550,8 +530,7 @@ namespace ngraph ...@@ -550,8 +530,7 @@ namespace ngraph
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index = auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic(external_function->ctx().get(), cuda_emitter->build_pad_dynamic({{args[0].get_type(), out[0].get_type()}},
{{args[0].get_type(), out[0].get_type()}},
input_shape, input_shape,
input_shape_padded, input_shape_padded,
padding_below, padding_below,
...@@ -569,8 +548,7 @@ namespace ngraph ...@@ -569,8 +548,7 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index = size_t index =
cudnn_emitter->build_convolution_backward_filter(external_function->ctx().get(), cudnn_emitter->build_convolution_backward_filter(out[0].get_type(),
out[0].get_type(),
input_shape_padded, input_shape_padded,
args[1].get_shape(), args[1].get_shape(),
out[0].get_shape(), out[0].get_shape(),
...@@ -770,13 +748,8 @@ namespace ngraph ...@@ -770,13 +748,8 @@ namespace ngraph
{ {
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(), auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_MAX, CUDNN_OP_TENSOR_MAX, out[0].get_type(), args[0].get_shape(), 1.0, 1.0, 0);
out[0].get_type(),
args[0].get_shape(),
1.0,
1.0,
0);
writer << "gpu::invoke_primitive(ctx, " << index << ", "; writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "," writer << "std::vector<void*>{" << args[0].get_name() << ","
...@@ -798,13 +771,8 @@ namespace ngraph ...@@ -798,13 +771,8 @@ namespace ngraph
{ {
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(), auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_MIN, CUDNN_OP_TENSOR_MIN, out[0].get_type(), args[0].get_shape(), 1.0, 1.0, 0);
out[0].get_type(),
args[0].get_shape(),
1.0,
1.0,
0);
writer << "gpu::invoke_primitive(ctx, " << index << ", "; writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "," writer << "std::vector<void*>{" << args[0].get_name() << ","
...@@ -838,11 +806,8 @@ namespace ngraph ...@@ -838,11 +806,8 @@ namespace ngraph
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter(); auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto bcast_index = auto bcast_index = cuda_emitter->build_broadcast(
cuda_emitter->build_broadcast(external_function->ctx().get(), {{args[0].get_type(), out[0].get_type()}}, result_shape, axes);
{{args[0].get_type(), out[0].get_type()}},
result_shape,
axes);
writer << "gpu::invoke_primitive(ctx, " << bcast_index << ", "; writer << "gpu::invoke_primitive(ctx, " << bcast_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), "; writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()"; writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
...@@ -954,11 +919,8 @@ namespace ngraph ...@@ -954,11 +919,8 @@ namespace ngraph
{ {
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto index = auto index = cuda_emitter->build_reshape(
cuda_emitter->build_reshape(external_function->ctx().get(), {{args[0].get_type(), out[0].get_type()}}, arg_shape, input_order);
{{args[0].get_type(), out[0].get_type()}},
arg_shape,
input_order);
writer << "gpu::invoke_primitive(ctx, " << index << ", "; writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), "; writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
...@@ -1031,8 +993,7 @@ namespace ngraph ...@@ -1031,8 +993,7 @@ namespace ngraph
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto index = auto index =
cuda_emitter->build_slice(external_function->ctx().get(), cuda_emitter->build_slice({{args[0].get_type(), out[0].get_type()}},
{{args[0].get_type(), out[0].get_type()}},
arg_shape, arg_shape,
lower_bounds, lower_bounds,
slice_strides, slice_strides,
...@@ -1117,7 +1078,6 @@ namespace ngraph ...@@ -1117,7 +1078,6 @@ namespace ngraph
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter(); auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto rs_index = cuda_emitter->build_reverse_sequence( auto rs_index = cuda_emitter->build_reverse_sequence(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}}, {{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
arg_shape0, arg_shape0,
arg_shape1, arg_shape1,
...@@ -1142,13 +1102,8 @@ namespace ngraph ...@@ -1142,13 +1102,8 @@ namespace ngraph
{ {
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(), auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_MUL, CUDNN_OP_TENSOR_MUL, out[0].get_type(), args[0].get_shape(), 1.0, 1.0, 0);
out[0].get_type(),
args[0].get_shape(),
1.0,
1.0,
0);
writer << "gpu::invoke_primitive(ctx, " << index << ", "; writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "," writer << "std::vector<void*>{" << args[0].get_name() << ","
...@@ -1201,13 +1156,8 @@ namespace ngraph ...@@ -1201,13 +1156,8 @@ namespace ngraph
{ {
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = cudnn_emitter->build_tensor_op(external_function->ctx().get(), auto index = cudnn_emitter->build_tensor_op(
CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_SQRT, out[0].get_type(), args[0].get_shape(), 1.0, 0, 0);
out[0].get_type(),
args[0].get_shape(),
1.0,
0,
0);
writer << "gpu::invoke_primitive(ctx, " << index << ", "; writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "," writer << "std::vector<void*>{" << args[0].get_name() << ","
...@@ -1259,8 +1209,7 @@ namespace ngraph ...@@ -1259,8 +1209,7 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto max_index = auto max_index =
cudnn_emitter->build_reduce_forward(external_function->ctx().get(), cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MAX,
CUDNN_REDUCE_TENSOR_MAX,
out[0].get_type(), out[0].get_type(),
args[0].get_shape(), args[0].get_shape(),
max_op->get_reduction_axes()); max_op->get_reduction_axes());
...@@ -1308,8 +1257,7 @@ namespace ngraph ...@@ -1308,8 +1257,7 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto min_index = auto min_index =
cudnn_emitter->build_reduce_forward(external_function->ctx().get(), cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MIN,
CUDNN_REDUCE_TENSOR_MIN,
out[0].get_type(), out[0].get_type(),
args[0].get_shape(), args[0].get_shape(),
min_op->get_reduction_axes()); min_op->get_reduction_axes());
...@@ -1348,8 +1296,7 @@ namespace ngraph ...@@ -1348,8 +1296,7 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto sum_index = auto sum_index =
cudnn_emitter->build_reduce_forward(external_function->ctx().get(), cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_ADD,
CUDNN_REDUCE_TENSOR_ADD,
out[0].get_type(), out[0].get_type(),
args[0].get_shape(), args[0].get_shape(),
sum->get_reduction_axes()); sum->get_reduction_axes());
...@@ -1393,8 +1340,7 @@ namespace ngraph ...@@ -1393,8 +1340,7 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto index = auto index =
cudnn_emitter->build_reduce_forward(external_function->ctx().get(), cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MUL,
CUDNN_REDUCE_TENSOR_MUL,
out[0].get_type(), out[0].get_type(),
args[0].get_shape(), args[0].get_shape(),
product->get_reduction_axes()); product->get_reduction_axes());
...@@ -1491,7 +1437,6 @@ namespace ngraph ...@@ -1491,7 +1437,6 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto reduce_index = cudnn_emitter->build_reduce_forward( auto reduce_index = cudnn_emitter->build_reduce_forward(
external_function->ctx().get(),
reduce_tensor_op, reduce_tensor_op,
out[0].get_type(), out[0].get_type(),
args[0].get_shape(), args[0].get_shape(),
...@@ -1596,7 +1541,6 @@ namespace ngraph ...@@ -1596,7 +1541,6 @@ namespace ngraph
args[0].get_type(), args[0].get_type(), out[0].get_type()}; args[0].get_type(), args[0].get_type(), out[0].get_type()};
reduce_index = cuda_emitter->build_reduce_window( reduce_index = cuda_emitter->build_reduce_window(
external_function->ctx().get(),
it->second, it->second,
dtypes, dtypes,
args[0].get_shape(), args[0].get_shape(),
...@@ -1631,8 +1575,7 @@ namespace ngraph ...@@ -1631,8 +1575,7 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_index = auto pad_index =
cuda_emitter->build_pad(external_function->ctx().get(), cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
{{args[0].get_type(), out[0].get_type()}},
input_shape, input_shape,
output_shape, output_shape,
padding_below, padding_below,
...@@ -1694,8 +1637,7 @@ namespace ngraph ...@@ -1694,8 +1637,7 @@ namespace ngraph
ss << TypeInfo::Get(args[0].get_element_type())->lowest(); ss << TypeInfo::Get(args[0].get_element_type())->lowest();
auto pad_index = auto pad_index =
cuda_emitter->build_pad(external_function->ctx().get(), cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
{{args[0].get_type(), out[0].get_type()}},
input_shape, input_shape,
shape_to_pool, shape_to_pool,
padding_below, padding_below,
...@@ -1733,7 +1675,6 @@ namespace ngraph ...@@ -1733,7 +1675,6 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
max_pool_index = cuda_emitter->build_1d_max_pool( max_pool_index = cuda_emitter->build_1d_max_pool(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}}, {{args[0].get_type(), out[0].get_type()}},
input_shape, input_shape,
result_shape, result_shape,
...@@ -1747,7 +1688,6 @@ namespace ngraph ...@@ -1747,7 +1688,6 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
max_pool_index = cudnn_emitter->build_pooling( max_pool_index = cudnn_emitter->build_pooling(
external_function->ctx().get(),
CUDNN_POOLING_MAX, CUDNN_POOLING_MAX,
out[0].get_type(), out[0].get_type(),
CUDNNEmitter::Prop::Forward, CUDNNEmitter::Prop::Forward,
...@@ -1795,8 +1735,7 @@ namespace ngraph ...@@ -1795,8 +1735,7 @@ namespace ngraph
if (fp_input_shape.size() >= 4) if (fp_input_shape.size() >= 4)
{ {
auto max_pool_bp_index = auto max_pool_bp_index =
cudnn_emitter->build_pooling(external_function->ctx().get(), cudnn_emitter->build_pooling(CUDNN_POOLING_MAX,
CUDNN_POOLING_MAX,
out[0].get_type(), out[0].get_type(),
CUDNNEmitter::Prop::Backward, CUDNNEmitter::Prop::Backward,
fp_input_shape, fp_input_shape,
...@@ -1835,8 +1774,7 @@ namespace ngraph ...@@ -1835,8 +1774,7 @@ namespace ngraph
direction = CUDNNEmitter::Prop::Inference; direction = CUDNNEmitter::Prop::Inference;
} }
auto bn_index = cudnn_emitter->build_batchnorm(external_function->ctx().get(), auto bn_index = cudnn_emitter->build_batchnorm(CUDNN_BATCHNORM_SPATIAL,
CUDNN_BATCHNORM_SPATIAL,
out[0].get_type(), out[0].get_type(),
direction, direction,
args[2].get_shape(), args[2].get_shape(),
...@@ -1872,8 +1810,7 @@ namespace ngraph ...@@ -1872,8 +1810,7 @@ namespace ngraph
auto& cudnn_emitter = auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter(); external_function->get_primitive_emitter()->get_cudnn_emitter();
auto bn_index = cudnn_emitter->build_batchnorm(external_function->ctx().get(), auto bn_index = cudnn_emitter->build_batchnorm(CUDNN_BATCHNORM_SPATIAL,
CUDNN_BATCHNORM_SPATIAL,
out[0].get_type(), out[0].get_type(),
CUDNNEmitter::Prop::Backward, CUDNNEmitter::Prop::Backward,
args[2].get_shape(), args[2].get_shape(),
...@@ -1970,8 +1907,7 @@ namespace ngraph ...@@ -1970,8 +1907,7 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
avg_pool_index = avg_pool_index =
cuda_emitter->build_avg_pool(external_function->ctx().get(), cuda_emitter->build_avg_pool({{args[0].get_type(), out[0].get_type()}},
{{args[0].get_type(), out[0].get_type()}},
input_shape, input_shape,
result_shape, result_shape,
avg_pool->get_window_shape(), avg_pool->get_window_shape(),
...@@ -1991,7 +1927,6 @@ namespace ngraph ...@@ -1991,7 +1927,6 @@ namespace ngraph
: CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
avg_pool_index = cudnn_emitter->build_pooling( avg_pool_index = cudnn_emitter->build_pooling(
external_function->ctx().get(),
cudnn_avg_type, cudnn_avg_type,
out[0].get_type(), out[0].get_type(),
CUDNNEmitter::Prop::Forward, CUDNNEmitter::Prop::Forward,
...@@ -2036,8 +1971,7 @@ namespace ngraph ...@@ -2036,8 +1971,7 @@ namespace ngraph
: CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
auto avg_pool_bp_index = auto avg_pool_bp_index =
cudnn_emitter->build_pooling(external_function->ctx().get(), cudnn_emitter->build_pooling(cudnn_avg_type,
cudnn_avg_type,
out[0].get_type(), out[0].get_type(),
CUDNNEmitter::Prop::Backward, CUDNNEmitter::Prop::Backward,
output_shape, output_shape,
...@@ -2095,7 +2029,6 @@ namespace ngraph ...@@ -2095,7 +2029,6 @@ namespace ngraph
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto replace_slice_index = cuda_emitter->build_replace_slice( auto replace_slice_index = cuda_emitter->build_replace_slice(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}}, {{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape, input_shape,
source_shape, source_shape,
...@@ -2146,7 +2079,6 @@ namespace ngraph ...@@ -2146,7 +2079,6 @@ namespace ngraph
size_t exp_sum_reduce = size_t exp_sum_reduce =
cuda_emitter cuda_emitter
->build_elementwise_collective<ngraph::op::Exp, ngraph::op::Add>( ->build_elementwise_collective<ngraph::op::Exp, ngraph::op::Add>(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}}, {{args[0].get_type(), out[0].get_type()}},
args[0].get_shape(), args[0].get_shape(),
{}, {},
...@@ -2166,7 +2098,6 @@ namespace ngraph ...@@ -2166,7 +2098,6 @@ namespace ngraph
// inplace binary division with fused broadcast to calculate softmax // inplace binary division with fused broadcast to calculate softmax
size_t div_broadcast = size_t div_broadcast =
cuda_emitter->build_elementwise_collective<ngraph::op::Divide>( cuda_emitter->build_elementwise_collective<ngraph::op::Divide>(
external_function->ctx().get(),
{{out[0].get_type(), out[0].get_type(), out[0].get_type()}}, {{out[0].get_type(), out[0].get_type(), out[0].get_type()}},
out[0].get_shape(), out[0].get_shape(),
{1}, {1},
...@@ -2181,8 +2112,7 @@ namespace ngraph ...@@ -2181,8 +2112,7 @@ namespace ngraph
else else
{ {
size_t softmax_index = size_t softmax_index =
cudnn_emitter->build_softmax(external_function->ctx().get(), cudnn_emitter->build_softmax(CUDNN_SOFTMAX_FAST,
CUDNN_SOFTMAX_FAST,
CUDNN_SOFTMAX_MODE_INSTANCE, CUDNN_SOFTMAX_MODE_INSTANCE,
out[0].get_type(), out[0].get_type(),
CUDNNEmitter::Prop::Forward, CUDNNEmitter::Prop::Forward,
......
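All of the call-site changes above follow the same two-phase pattern: at compile time the op emitter registers a primitive with the shared primitive emitter and receives an index, and the generated source then invokes that primitive by index through gpu::invoke_primitive at run time. A small sketch of the code-generation half (emit_invoke is a made-up helper; the emitted string mirrors the writer << lines above):

#include <cstddef>
#include <sstream>
#include <string>

// Produce the runtime invocation for a primitive registered at compile time.
std::string emit_invoke(size_t index, const std::string& arg_name, const std::string& out_name)
{
    std::ostringstream writer;
    writer << "gpu::invoke_primitive(ctx, " << index << ", ";
    writer << "std::vector<void*>{" << arg_name << "}.data(), ";
    writer << "std::vector<void*>{" << out_name << "}.data());\n";
    return writer.str();
}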
...@@ -85,8 +85,8 @@ namespace ngraph ...@@ -85,8 +85,8 @@ namespace ngraph
dtypes.push_back(arg.get_type()); dtypes.push_back(arg.get_type());
} }
dtypes.push_back(out[0].get_type()); dtypes.push_back(out[0].get_type());
auto ew_index = cuda_emitter->build_elementwise<T>( auto ew_index =
external_function->ctx().get(), dtypes, out[0].get_shape()); cuda_emitter->build_elementwise<T>(dtypes, out[0].get_shape());
writer << "gpu::invoke_primitive(ctx, " << ew_index << ", "; writer << "gpu::invoke_primitive(ctx, " << ew_index << ", ";
writer << "std::vector<void*>{" << args.front().get_name(); writer << "std::vector<void*>{" << args.front().get_name();
for (size_t i = 1; i < args.size(); i++) for (size_t i = 1; i < args.size(); i++)
......
...@@ -237,44 +237,21 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -237,44 +237,21 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Or), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Or>}}; {TI(ngraph::op::Or), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Or>}};
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction( runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function) const shared_ptr<ngraph::Function>& function,
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function)
: m_compiled_function(nullptr) : m_compiled_function(nullptr)
, m_ctx(new GPURuntimeContext)
, m_function(function) , m_function(function)
, m_emit_timing(false) , m_emit_timing(false)
, m_is_compiled(false) , m_is_compiled(false)
, m_release_function(release_function) , m_release_function(release_function)
, m_temporaries_used(false) , m_temporaries_used(false)
, m_shared_context(shared_context)
{ {
// Create context use driver API and make it current, the runtime call will pickup the context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
// #interoperability-between-runtime-and-driver-apis
ngraph::runtime::gpu::CudaContextManager::Instance().SetContextCurrent();
cublasStatus_t cublasStatus = cublasCreate(&m_cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
throw runtime_error("cuBLAS create handle failed");
}
cudnnStatus_t cudnnStatus = cudnnCreate(&m_cudnn_handle);
if (cudnnStatus != CUDNN_STATUS_SUCCESS)
{
throw runtime_error("cuDNN create handle failed");
}
// Pass scalars as reference on the Device
cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
// register with c-api runtime context
m_ctx->cublas_handle = &m_cublas_handle;
m_ctx->cudnn_handle = &m_cudnn_handle;
m_ctx->compiled_kernel_pool = new CudaFunctionPool;
} }
runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction() runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
{ {
cublasDestroy(m_cublas_handle);
cudnnDestroy(m_cudnn_handle);
delete m_ctx->compiled_kernel_pool;
} }
void runtime::gpu::GPU_ExternalFunction::emit_header() void runtime::gpu::GPU_ExternalFunction::emit_header()
...@@ -387,7 +364,8 @@ void runtime::gpu::GPU_ExternalFunction::emit_constant_declarations() ...@@ -387,7 +364,8 @@ void runtime::gpu::GPU_ExternalFunction::emit_constant_declarations()
{ {
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view(); shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
// get an allocator for transient per kernel gpu memory // get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator(); GPUAllocator allocator =
m_shared_context->m_primitive_emitter->get_memory_allocator();
size_t idx = allocator.reserve_argspace( size_t idx = allocator.reserve_argspace(
c->get_data_ptr(), c->get_data_ptr(),
tv->get_tensor().size() * tv->get_tensor().get_element_type().size()); tv->get_tensor().size() * tv->get_tensor().get_element_type().size());
...@@ -698,8 +676,6 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -698,8 +676,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
return; return;
} }
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
m_function_name = m_function->get_name(); m_function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt"); string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
...@@ -722,8 +698,9 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -722,8 +698,9 @@ void runtime::gpu::GPU_ExternalFunction::compile()
emit_function_declarations(); emit_function_declarations();
collect_unique_functions(); collect_unique_functions();
emit_functions(); emit_functions();
// allocate device buffers for primitive arguments and workspace // allocate device buffers for primitive arguments and workspace
m_primitive_emitter->allocate_primitive_memory(); m_shared_context->m_primitive_emitter->allocate_primitive_memory();
string code = m_writer.get_code(); string code = m_writer.get_code();
store_emitted_functions(code); store_emitted_functions(code);
...@@ -781,11 +758,6 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(Node* node) ...@@ -781,11 +758,6 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(Node* node)
} }
} }
unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
{
return m_ctx;
}
string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node, string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
const string& function_name) const string& function_name)
{ {
......
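The key change to GPU_ExternalFunction is constructor injection of the backend's shared state: instead of creating its own cuBLAS/cuDNN handles and primitive emitter, it now holds a std::shared_ptr to GPU_Backend::BackendContext, so every function compiled by one backend shares a single context. A hedged sketch of that ownership shape (BackendContextSketch and ExternalFunctionSketch are invented names; the real classes also carry the runtime context, primitive emitter, and codegen state):

#include <memory>
#include <utility>

struct BackendContextSketch
{
    // stands in for the backend-owned runtime context and primitive emitter
};

class ExternalFunctionSketch
{
public:
    explicit ExternalFunctionSketch(std::shared_ptr<BackendContextSketch> shared_context)
        : m_shared_context(std::move(shared_context))
    {
    }

private:
    std::shared_ptr<BackendContextSketch> m_shared_context;
};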
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include "ngraph/pass/manager.hpp" #include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp" #include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/result_copy_elimination.hpp" #include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp" #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp" #include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp" #include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
...@@ -62,6 +63,7 @@ namespace ngraph ...@@ -62,6 +63,7 @@ namespace ngraph
public: public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function, GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
std::shared_ptr<GPU_Backend::BackendContext>& shared_context,
bool release_function = true); bool release_function = true);
~GPU_ExternalFunction(); ~GPU_ExternalFunction();
...@@ -69,7 +71,7 @@ namespace ngraph ...@@ -69,7 +71,7 @@ namespace ngraph
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx(); std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
{ {
return m_primitive_emitter; return m_shared_context->m_primitive_emitter;
} }
protected: protected:
...@@ -98,8 +100,6 @@ namespace ngraph ...@@ -98,8 +100,6 @@ namespace ngraph
std::unique_ptr<codegen::Compiler> m_compiler; std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine; std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unique_ptr<GPURuntimeContext> m_ctx;
std::shared_ptr<ngraph::Function> m_function; std::shared_ptr<ngraph::Function> m_function;
std::map<std::string, size_t> m_name_index_map; std::map<std::string, size_t> m_name_index_map;
...@@ -116,8 +116,7 @@ namespace ngraph ...@@ -116,8 +116,7 @@ namespace ngraph
std::string m_function_name; std::string m_function_name;
std::string m_pch_header_source; std::string m_pch_header_source;
cublasHandle_t m_cublas_handle; std::shared_ptr<GPU_Backend::BackendContext> m_shared_context;
cudnnHandle_t m_cudnn_handle;
}; };
} }
} }
......
...@@ -28,17 +28,36 @@ runtime::gpu::GPUMemoryManager::GPUMemoryManager(GPUPrimitiveEmitter* emitter) ...@@ -28,17 +28,36 @@ runtime::gpu::GPUMemoryManager::GPUMemoryManager(GPUPrimitiveEmitter* emitter)
: m_buffer_offset(0) : m_buffer_offset(0)
, m_buffered_mem(initial_buffer_size) , m_buffered_mem(initial_buffer_size)
, m_workspace_manager(alignment) , m_workspace_manager(alignment)
, m_argspace(nullptr) , m_argspace_mem(1, {nullptr, 0})
, m_workspace(nullptr) , m_workspace_mem(1, {nullptr, 0})
, m_allocation_size(0)
, m_primitive_emitter(emitter) , m_primitive_emitter(emitter)
{ {
} }
size_t runtime::gpu::GPUMemoryManager::get_allocation_size() const
{
size_t allocation_size = 0;
for (auto const& alloc : m_argspace_mem)
{
allocation_size += alloc.size;
}
for (auto const& alloc : m_workspace_mem)
{
allocation_size += alloc.size;
}
return allocation_size;
}
runtime::gpu::GPUMemoryManager::~GPUMemoryManager() runtime::gpu::GPUMemoryManager::~GPUMemoryManager()
{ {
runtime::gpu::free_gpu_buffer(m_argspace); for (auto& alloc : m_argspace_mem)
runtime::gpu::free_gpu_buffer(m_workspace); {
runtime::gpu::free_gpu_buffer(alloc.ptr);
}
for (auto& alloc : m_workspace_mem)
{
runtime::gpu::free_gpu_buffer(alloc.ptr);
}
} }
void runtime::gpu::GPUMemoryManager::allocate() void runtime::gpu::GPUMemoryManager::allocate()
...@@ -46,15 +65,25 @@ void runtime::gpu::GPUMemoryManager::allocate() ...@@ -46,15 +65,25 @@ void runtime::gpu::GPUMemoryManager::allocate()
if (m_buffer_offset) if (m_buffer_offset)
{ {
m_buffer_offset = pass::MemoryManager::align(m_buffer_offset, alignment); m_buffer_offset = pass::MemoryManager::align(m_buffer_offset, alignment);
m_argspace = runtime::gpu::create_gpu_buffer(m_buffer_offset); // the back-most node is always empty; fill it here
runtime::gpu::cuda_memcpyHtD(m_argspace, m_buffered_mem.data(), m_buffer_offset); m_argspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(m_buffer_offset);
m_allocation_size += m_buffer_offset; m_argspace_mem.back().size = m_buffer_offset;
// copy buffered kernel arguments to device
runtime::gpu::cuda_memcpyHtD(
m_argspace_mem.back().ptr, m_buffered_mem.data(), m_buffer_offset);
// add an empty node to the end of the list and zero the offset
m_argspace_mem.push_back({nullptr, 0});
m_buffer_offset = 0;
} }
auto workspace_size = m_workspace_manager.max_allocated(); auto workspace_size = m_workspace_manager.max_allocated();
if (workspace_size) if (workspace_size)
{ {
m_workspace = runtime::gpu::create_gpu_buffer(workspace_size); m_workspace_mem.back().ptr = runtime::gpu::create_gpu_buffer(workspace_size);
m_allocation_size += workspace_size; m_workspace_mem.back().size = workspace_size;
m_workspace_mem.push_back({nullptr, 0});
// construct a new manager if the current one was used
m_workspace_manager = pass::MemoryManager(alignment);
} }
} }
...@@ -86,17 +115,16 @@ size_t runtime::gpu::GPUAllocator::reserve_argspace(const void* data, size_t siz ...@@ -86,17 +115,16 @@ size_t runtime::gpu::GPUAllocator::reserve_argspace(const void* data, size_t siz
// add parameter data to host buffer that will be transferred to device // add parameter data to host buffer that will be transferred to device
size = pass::MemoryManager::align(size, runtime::gpu::GPUMemoryManager::alignment); size = pass::MemoryManager::align(size, runtime::gpu::GPUMemoryManager::alignment);
size_t offset = m_manager->queue_for_transfer(data, size); size_t offset = m_manager->queue_for_transfer(data, size);
// required to capture m_manager pointer auto local = std::prev(m_manager->m_argspace_mem.end());
// directly rather than `this` pointer
auto manager = m_manager;
// return a lambda that will yield the gpu memory address. this // return a lambda that will yield the gpu memory address. this
// should only be evaluated by the runtime invoked primitive // should only be evaluated by the runtime invoked primitive
gpu::memory_primitive mem_primitive = [=]() { gpu::memory_primitive mem_primitive = [=]() {
if (manager->m_argspace == nullptr) void* argspace = (*local).ptr;
if (argspace == nullptr)
{ {
throw std::runtime_error("An attempt was made to use unallocated device memory."); throw std::runtime_error("An attempt was made to use unallocated device memory.");
} }
auto gpu_mem = static_cast<uint8_t*>(manager->m_argspace); auto gpu_mem = static_cast<uint8_t*>(argspace);
return static_cast<void*>(gpu_mem + offset); return static_cast<void*>(gpu_mem + offset);
}; };
return m_manager->m_primitive_emitter->insert(mem_primitive); return m_manager->m_primitive_emitter->insert(mem_primitive);
@@ -106,23 +134,22 @@ size_t runtime::gpu::GPUAllocator::reserve_workspace(size_t size, bool zero_initialize)
 {
     size_t offset = m_manager->m_workspace_manager.allocate(size);
     m_active.push(offset);
-    // required to capture m_manager pointer
-    // directly rather than `this` pointer
-    auto manager = m_manager;
+    auto local = std::prev(m_manager->m_workspace_mem.end());
     // return a lambda that will yield the gpu memory address. this
     // should only be evaluated by the runtime invoked primitive
     gpu::memory_primitive mem_primitive = [=]() {
-        if (manager->m_workspace == nullptr)
+        void* workspace = (*local).ptr;
+        if (workspace == nullptr)
         {
             throw std::runtime_error("An attempt was made to use unallocated device memory.");
         }
-        auto gpu_mem = static_cast<uint8_t*>(manager->m_workspace);
-        auto workspace = static_cast<void*>(gpu_mem + offset);
+        auto gpu_mem = static_cast<uint8_t*>(workspace);
+        auto workspace_ptr = static_cast<void*>(gpu_mem + offset);
         if (zero_initialize)
         {
-            runtime::gpu::cuda_memset(workspace, 0, size);
+            runtime::gpu::cuda_memset(workspace_ptr, 0, size);
         }
-        return workspace;
+        return workspace_ptr;
     };
     return m_manager->m_primitive_emitter->insert(mem_primitive);
 }
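Note: the switch from capturing the manager pointer to capturing an iterator obtained with std::prev(list.end()) works because std::list iterators remain valid across later push_back calls, so each memory primitive keeps referring to the node that was current when it was reserved. A minimal illustration of that guarantee, with hypothetical names:

#include <cassert>
#include <functional>
#include <iterator>
#include <list>

int main()
{
    std::list<int> buffers{0};               // the back node is the "current" slot
    auto current = std::prev(buffers.end()); // capture the node, not the container
    std::function<int()> primitive = [current]() { return *current; };

    *current = 42;        // the slot is filled later, as in allocate()
    buffers.push_back(0); // std::list::push_back never invalidates iterators

    assert(primitive() == 42); // the lambda still sees the node it captured
    return 0;
}

A std::vector would not offer the same guarantee, since growth can reallocate and invalidate iterators, which is presumably why the allocations are kept in a std::list here.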
...
@@ -16,6 +16,7 @@
 #pragma once

+#include <list>
 #include <memory>
 #include <stack>
 #include <vector>
@@ -65,7 +66,7 @@ namespace ngraph
             ~GPUMemoryManager();
             void allocate();
-            size_t get_allocation_size() { return m_allocation_size; }
+            size_t get_allocation_size() const;
             GPUAllocator build_allocator() { return GPUAllocator(this); }
         private:
             GPUMemoryManager(GPUPrimitiveEmitter* emitter);
@@ -75,10 +76,15 @@ namespace ngraph
             std::vector<uint8_t> m_buffered_mem;
             pass::MemoryManager m_workspace_manager;
             static constexpr const uint16_t alignment = 8;
-            void* m_argspace;
-            void* m_workspace;
-            size_t m_allocation_size;
+            struct allocation
+            {
+                void* ptr;
+                size_t size;
+            };
+            std::list<allocation> m_argspace_mem;
+            std::list<allocation> m_workspace_mem;
             GPUPrimitiveEmitter* m_primitive_emitter;
         };
     }
...
@@ -23,8 +23,15 @@ using namespace ngraph;
 using namespace ngraph::runtime::gpu;

 GPUPrimitiveEmitter::GPUPrimitiveEmitter()
-    : m_cuda_emitter(new CUDAEmitter(this))
-    , m_cudnn_emitter(new CUDNNEmitter(this))
+    : m_cuda_emitter(new CUDAEmitter(this, nullptr))
+    , m_cudnn_emitter(new CUDNNEmitter(this, nullptr))
+    , m_memory_manager(this)
+{
+}
+
+GPUPrimitiveEmitter::GPUPrimitiveEmitter(const std::unique_ptr<GPURuntimeContext>& ctx)
+    : m_cuda_emitter(new CUDAEmitter(this, ctx.get()))
+    , m_cudnn_emitter(new CUDNNEmitter(this, ctx.get()))
     , m_memory_manager(this)
 {
 }
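Note: the new overload takes the context by const reference to the owning unique_ptr and stores only the raw pointer, so the emitters observe the runtime context without taking ownership of it. A standalone sketch of that borrowing pattern with illustrative names (Context and Emitter are not the ngraph classes):

#include <cassert>
#include <memory>

struct Context // stand-in for GPURuntimeContext
{
    int value = 7;
};

class Emitter // illustrates the two-constructor pattern added above
{
public:
    Emitter()
        : m_ctx(nullptr)
    {
    }
    // borrows the context; the caller keeps ownership through the unique_ptr
    Emitter(const std::unique_ptr<Context>& ctx)
        : m_ctx(ctx.get())
    {
    }
    bool has_context() const { return m_ctx != nullptr; }
    int read() const { return m_ctx->value; }

private:
    Context* m_ctx; // non-owning observer, like the emitters' m_ctx pointer
};

int main()
{
    auto ctx = std::make_unique<Context>();
    Emitter with_ctx(ctx);
    Emitter without_ctx;
    assert(with_ctx.has_context() && with_ctx.read() == 7);
    assert(!without_ctx.has_context());
    return 0;
}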
...
@@ -31,11 +31,11 @@ namespace ngraph
 {
     class CUDAEmitter;
     class CUDNNEmitter;

     class GPUPrimitiveEmitter
     {
     public:
         GPUPrimitiveEmitter();
+        GPUPrimitiveEmitter(const std::unique_ptr<GPURuntimeContext>& ctx);
         std::unique_ptr<CUDAEmitter>& get_cuda_emitter();
         std::unique_ptr<CUDNNEmitter>& get_cudnn_emitter();
         std::vector<gpu::primitive*>& get_primitives() { return m_gpu_primitives; }
...