Commit 68eb2e7d authored by Fenglei's avatar Fenglei Committed by Robert Kimball

pass args instead of pointer to array (#1591)

parent 309bfdf0
...@@ -588,19 +588,6 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2> ...@@ -588,19 +588,6 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>
return primitive_index; return primitive_index;
} }
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_reshape_op(writer, kernel_name.str(), dtypes, rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape)); uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
// TODO: currently we set it to 64, will add tuning method later // TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64; uint32_t block_size_x = 64;
...@@ -620,33 +607,46 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2> ...@@ -620,33 +607,46 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>
} }
// get an allocator for transient per kernel gpu memory // get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator(); auto args = m_primitive_emitter->add_kernel_args();
size_t idx_input_strides = args.add_placeholder(dtypes[0], "in")
allocator.reserve_argspace(input_strides.data(), input_strides.size() * sizeof(uint32_t)); .add_placeholder(dtypes[1], "out")
size_t idx_trans_strides = .add("input_strides", input_strides)
allocator.reserve_argspace(trans_strides.data(), trans_strides.size() * sizeof(uint32_t)); .add("trans_strides", trans_strides)
.add("n", nthreads);
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_reshape_op(writer, kernel_name.str(), args, dtypes, rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
// create the launch primitive // create the launch primitive
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs, std::unique_ptr<gpu::primitive> kernel_launch(
void** outputs) mutable { new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides); void** args_list = args.resolve_placeholder(0, &inputs[0])
void* param_trans_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_trans_strides); .resolve_placeholder(1, &outputs[0])
std::vector<void*> args_list{ .get_argument_list();
&inputs[0], &outputs[0], &param_input_strides, &param_trans_strides, &nthreads};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(), CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x, aligned_grid_size_x,
1, 1,
1, // grid dim 1, // grid dim
block_size_x, block_size_x,
1, 1,
1, // block dim 1, // block dim
0, 0,
NULL, // shared mem and stream NULL, // shared mem and stream
args_list.data(), args_list,
0)); // arguments 0)); // arguments
debug_sync(); debug_sync();
}}); }});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch)); primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index); m_primitive_emitter->cache(hash, primitive_index);
......
...@@ -452,12 +452,11 @@ void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer, ...@@ -452,12 +452,11 @@ void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer, void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::array<std::string, 2>& data_types, const std::array<std::string, 2>& data_types,
size_t rank) size_t rank)
{ {
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, " writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
<< data_types[1]
<< "* out, uint32_t* input_strides, uint32_t* trans_strides, uint32_t n)\n";
writer.block_begin(); writer.block_begin();
{ {
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n"; writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
...@@ -469,12 +468,12 @@ void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer ...@@ -469,12 +468,12 @@ void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer
size_t i = 0; size_t i = 0;
for (; i < rank - 1; i++) for (; i < rank - 1; i++)
{ {
writer << "output_idx += (input_idx / input_strides[" << i << "]) * trans_strides[" writer << "output_idx += (input_idx / input_strides" << i << ") * trans_strides"
<< i << "];\n"; << i << ";\n";
writer << "input_idx %= input_strides[" << i << "];\n"; writer << "input_idx %= input_strides" << i << ";\n";
} }
writer << "output_idx += (input_idx / input_strides[" << i << "]) * trans_strides[" << i writer << "output_idx += (input_idx / input_strides" << i << ") * trans_strides" << i
<< "];\n"; << ";\n";
writer << "out[output_idx] = in[tid];\n"; writer << "out[output_idx] = in[tid];\n";
} }
writer.block_end(); writer.block_end();
......
...@@ -56,6 +56,7 @@ namespace ngraph ...@@ -56,6 +56,7 @@ namespace ngraph
static void get_reshape_op(codegen::CodeWriter& writer, static void get_reshape_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::array<std::string, 2>& data_types, const std::array<std::string, 2>& data_types,
size_t rank); size_t rank);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment