Commit 38ba5c12 authored by Fenglei, committed by Scott Cyphers

gpu concat optimize (#1259)

* optimize concat

* compile success

* multi inputs

* clang format
parent deacf29a
@@ -69,6 +69,100 @@ runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitte
m_ctx = ctx;
}
size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& dtypes,
std::vector<GPUShape> input_shapes,
size_t concat_axis,
GPUShape output_shape)
{
std::stringstream kernel_name;
size_t input_size = input_shapes.size();
kernel_name << "concat_" << join(dtypes, "_") << "_r_" << input_size;
std::stringstream hash;
hash << kernel_name.str() << "_o_" << join(output_shape, "_") << "_a_" << concat_axis;
for (size_t i = 0; i < input_size; i++)
{
hash << "_i_" << join(input_shapes[i], "_");
}
// For backwards compatibility we currently use two unordered maps
// 1. one looks up the compiled cuda kernel (CudaFunctionPool)
// 2. the other looks to see if this kernel is already in the primitive list
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash.str());
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_concat_op(writer, kernel_name.str(), dtypes, input_shapes.size());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
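// block_strides[i] will hold the number of contiguous output elements that
// input i contributes per output "row" above the concat axis; block_size is
// one full output row. For example, concatenating shapes {2,2} and {2,3} on
// axis 1 gives block_strides = {2,3} and block_size = 5.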
std::vector<uint32_t> block_strides(input_size, 1);
uint32_t block_size = 0;
for (size_t i = 0; i < input_size; i++)
{
auto arg_rank = input_shapes[i].size();
for (size_t j = concat_axis; j < arg_rank; j++)
{
block_strides[i] *= input_shapes[i][j];
}
block_size += block_strides[i];
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: block size is currently fixed at 64; a tuning method will be added later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
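// one thread per output element; the grid is rounded up to a whole number of thread blocks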
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_block_strides =
allocator.reserve_argspace(block_strides.data(), block_strides.size() * sizeof(uint32_t));
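// the host-side block_strides table is staged in the argument-space pool here and
// copied into device memory when the primitive runs (via invoke_memory_primitive below)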
// create the launch primitive
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_block_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_block_strides);
std::vector<void*> args_list;
for (size_t i = 0; i < input_size; i++)
{
args_list.push_back(&inputs[i]);
}
args_list.push_back(&outputs[0]);
args_list.push_back(&param_block_strides);
args_list.push_back(&block_size);
args_list.push_back(&nthreads);
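// cuLaunchKernel expects an array of pointers to the kernel arguments, so args_list
// stores the addresses of the tensor pointers and scalar parameters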
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1, // grid dim
block_size_x,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list.data(),
0)); // arguments
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash.str(), primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
......
@@ -135,6 +135,11 @@ namespace ngraph
GPUShape filter_dilation,
GPUShape output_shape);
size_t build_concat(const std::vector<std::string>& dtypes,
std::vector<GPUShape> input_shapes,
size_t concat_axis,
GPUShape output_shape);
void debug_sync();
void sync();
......
@@ -252,17 +252,17 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
writer << data_types[i] << "* in" << i << ", ";
}
writer << data_types[num_inputs]
<< "* out, size_t* block_strides, size_t block_size, size_t n)\n";
<< "* out, uint32_t* block_strides, uint32_t block_size, uint32_t n)\n";
writer.block_begin();
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if(tid < n)\n";
writer.block_begin();
{
writer << "out[tid] = 1;\n";
writer << "size_t output_idx = tid;\n";
writer << "size_t block_id = tid / block_size;\n";
writer << "size_t block_idx = tid % block_size;\n";
writer << "uint32_t output_idx = tid;\n";
writer << "uint32_t block_id = tid / block_size;\n";
writer << "uint32_t block_idx = tid % block_size;\n";
writer << "bool processed = false;\n";
for (size_t i = 0; i < num_inputs; i++)
{
......
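For reference, the kernel that get_concat_op emits for two float inputs would look roughly like the sketch below. The per-input selection logic lives in the loop elided above, so this is an inferred illustration based on the block_strides/block_size bookkeeping, not the verbatim generated source:

extern "C" __global__ void concat_float_float_float_r_2(
    float* in0, float* in1, float* out,
    uint32_t* block_strides, uint32_t block_size, uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        uint32_t block_id = tid / block_size;  // which output "row"
        uint32_t block_idx = tid % block_size; // offset within that row
        // walk the inputs until block_idx falls inside input i's stride
        if (block_idx < block_strides[0])
        {
            out[tid] = in0[block_id * block_strides[0] + block_idx];
        }
        else
        {
            block_idx -= block_strides[0];
            out[tid] = in1[block_id * block_strides[1] + block_idx];
        }
    }
}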
@@ -54,46 +54,6 @@ namespace ngraph
CUdeviceptr reverse_axes,
size_t rank,
size_t count);
template <typename... Inputs>
void emit_concat_op(const std::string& name,
const std::vector<std::string>& data_types,
GPURuntimeContext* ctx,
size_t count,
size_t block_size,
CUdeviceptr block_strides,
CUdeviceptr out,
Inputs&&... inputs)
{
std::string type_signature = "_" + join(data_types, "_");
std::replace(type_signature.begin(), type_signature.end(), ' ', '_');
auto compiled_kernel = ctx->compiled_kernel_pool->get(name + type_signature);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_concat_op(
writer, name + type_signature, data_types, sizeof...(inputs));
std::string kernel = writer.get_code();
compiled_kernel = ctx->compiled_kernel_pool->set(name + type_signature, kernel);
}
void* args_list[] = {&inputs..., &out, &block_strides, &block_size, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
}
}
@@ -824,46 +824,32 @@ namespace ngraph
auto concat = static_cast<const ngraph::op::Concat*>(node);
auto axis = concat->get_concatenation_axis();
std::vector<size_t> block_strides(args.size(), 1);
size_t block_size = 0;
for (size_t i = 0; i < args.size(); i++)
std::vector<std::string> dtypes;
std::vector<GPUShape> input_shapes;
for (auto arg : args)
{
auto arg_shape = args[i].get_shape();
auto arg_rank = arg_shape.size();
for (size_t j = axis; j < arg_rank; j++)
{
block_strides[i] *= arg_shape[j];
}
block_size += block_strides[i];
dtypes.push_back(arg.get_type());
input_shapes.push_back(arg.get_shape());
}
dtypes.push_back(out[0].get_type());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer << "int num_inputs = " << args.size() << ";\n";
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_block_strides = allocator.reserve_argspace(
block_strides.data(), block_strides.size() * sizeof(size_t));
writer << "void* block_strides_d = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_block_strides << ");\n";
writer << "ngraph::runtime::gpu::emit_concat_op(\"" << node->description() << "\""
<< ", std::vector<std::string>{";
for (size_t i = 0; i < args.size(); i++)
{
writer << "\"" << args[i].get_type() << "\", ";
}
writer << "\"" << out[0].get_type() << "\"}"
<< ", ctx"
<< ", count"
<< ", " << block_size << ", CUdeviceptr(block_strides_d)"
<< ", CUdeviceptr(" << out[0].get_name() << ")";
for (size_t i = 0; i < args.size(); i++)
{
writer << ", CUdeviceptr(" << args[i].get_name() << ")";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto index =
cuda_emitter->build_concat(dtypes, input_shapes, axis, out[0].get_shape());
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name();
for (size_t i = 1; i < args.size(); i++)
{
writer << ", " << args[i].get_name();
}
writer << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
writer << ");\n";
writer.block_end();
}
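With the primitive built once at compile time, the code emitted for a two-input concat reduces to a single invoke_primitive call, roughly like this (the primitive index and tensor names here are illustrative):

gpu::invoke_primitive(ctx, 42,
                      std::vector<void*>{arg0, arg1}.data(),
                      std::vector<void*>{out0}.data());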
......