Commit 6cd35432 authored by Fenglei, committed by Robert Kimball

nvgpu one hot update (#1773)

* update onehot

* clang

* fix bugs

* format

* add output_datatype_size to hash

* typo

* hash
parent f642bc4c
@@ -170,13 +170,15 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& d
 size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>& dtypes,
                                                NVShape input_shape,
                                                NVShape output_shape,
-                                               size_t one_hot_axis)
+                                               size_t one_hot_axis,
+                                               size_t output_datatype_size)
 {
     std::stringstream kernel_name;
     kernel_name << "onehot_" << join(dtypes, "_");
     std::string hash = kernel_name.str() + "_i_" + join(input_shape, "_") + "_o_" +
-                       join(output_shape, "_") + std::to_string(one_hot_axis);
+                       join(output_shape, "_") + "_axis_" + std::to_string(one_hot_axis) +
+                       "_datasize_" + std::to_string(output_datatype_size);
     // For backwards compatibility we currently use two unordered maps
     // 1. one looks up the compiled cuda kernel (CudaFunctionPool)
     // 2. the other looks to see if this kernel is already in the primitive list
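For illustration, with hypothetical values (input shape {2, 3}, output shape {2, 5, 3}, axis 1, float in and out, a 4-byte output element), the new hash would read:

    onehot_float_float_i_2_3_o_2_5_3_axis_1_datasize_4

Previously the axis was appended to the output-shape digits with no separator and the element size was absent, so distinct instantiations could collide on one cached kernel.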
@@ -206,18 +208,19 @@ size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>&
     uint32_t block_size_x = 64;
     uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
-    uint32_t repeat_times = static_cast<uint32_t>(output_shape[one_hot_axis]);
-    uint32_t repeat_size = 1;
+    uint32_t hot_axis_shape = static_cast<uint32_t>(output_shape[one_hot_axis]);
+    uint32_t hot_axis_stride = 1;
     for (size_t i = one_hot_axis + 1; i < output_shape.size(); i++)
     {
-        repeat_size *= output_shape[i];
+        hot_axis_stride *= output_shape[i];
     }
+    uint32_t output_size = static_cast<uint32_t>(shape_size(output_shape) * output_datatype_size);
     // create the launch primitive
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             std::vector<void*> args_list{
-                &inputs[0], &outputs[0], &repeat_size, &repeat_times, &nthreads};
+                &inputs[0], &outputs[0], &hot_axis_stride, &hot_axis_shape, &nthreads};
+            runtime::gpu::cuda_memset(outputs[0], 0, output_size);
             CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
                                           aligned_grid_size_x,
                                           1,
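As a minimal sketch of the renamed launch parameters, assume an illustrative output_shape of {2, 5, 3} with one_hot_axis = 1 (none of these values are from the patch):

    // output_shape = {2, 5, 3}, one_hot_axis = 1          (illustrative)
    // hot_axis_shape  = output_shape[1] = 5               extent of the one-hot axis
    // hot_axis_stride = 3                                 product of the dims after it
    // output_size     = 2 * 5 * 3 * output_datatype_size  = 120 bytes for a 4-byte type
    //
    // cuda_memset zero-fills the whole output before launch, so the kernel only
    // scatters the 1s; an input whose class index is out of range now leaves its
    // slice of the output all zeros instead of writing out of bounds.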
@@ -99,7 +99,8 @@ namespace ngraph
                 size_t build_onehot(const std::array<std::string, 2>& dtypes,
                                     NVShape input_shape,
                                     NVShape output_shape,
-                                    size_t one_hot_axis);
+                                    size_t one_hot_axis,
+                                    size_t output_datatype_size);

                 size_t build_reverse(const std::array<std::string, 2>& dtypes,
                                      NVShape input_shape,
@@ -435,15 +435,24 @@ void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
                                                     const std::array<std::string, 2>& data_types)
 {
     writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
-           << data_types[1] << "* out, uint32_t m, uint32_t k, uint32_t n)\n";
+           << data_types[1]
+           << "* out, uint32_t hot_axis_stride, uint32_t hot_axis_shape, uint32_t n)\n";
     writer.block_begin();
     {
         writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
         writer << "if (tid < n)\n";
         writer.block_begin();
         {
-            writer << "uint32_t idx = (tid / m) * m * k + (m * in[tid]) + tid % m;\n";
-            writer << "out[idx] = 1;\n";
+            writer << "int32_t in_pixel = static_cast<int32_t>(in[tid]);\n";
+            writer << "if(in_pixel >= 0 && in_pixel < hot_axis_shape)\n";
+            writer.block_begin();
+            {
+                writer << "uint32_t idx = tid / hot_axis_stride * hot_axis_stride * hot_axis_shape "
+                          "+ (hot_axis_stride * in_pixel) + tid % "
+                          "hot_axis_stride;\n";
+                writer << "out[idx] = 1;\n";
+            }
+            writer.block_end();
         }
         writer.block_end();
     }
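Read through the CodeWriter calls, the builder emits a kernel roughly like the one below for a float-to-float instantiation (the function name and dtypes are illustrative, and the real generated prelude may obtain the fixed-width types differently):

    #include <cstdint> // assumption: uint32_t/int32_t visible to the generated source

    extern "C" __global__ void cuda_onehot_float_float(
        float* in, float* out, uint32_t hot_axis_stride, uint32_t hot_axis_shape, uint32_t n)
    {
        // one thread per input element
        uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            // the class index arrives in the input dtype and is truncated to an int
            int32_t in_pixel = static_cast<int32_t>(in[tid]);
            // new bounds check: negative or too-large indices are dropped instead
            // of scattering out of bounds
            if (in_pixel >= 0 && in_pixel < hot_axis_shape)
            {
                uint32_t idx = tid / hot_axis_stride * hot_axis_stride * hot_axis_shape
                               + (hot_axis_stride * in_pixel) + tid % hot_axis_stride;
                out[idx] = 1;
            }
        }
    }

Since dropped indices write nothing, correctness depends on the output being zeroed first, which is what the cuda_memset added in build_onehot provides.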
@@ -792,13 +792,17 @@ void runtime::gpu::GPU_Emitter::emit_OneHot(EMIT_ARGS)
     auto onehot = static_cast<const ngraph::op::OneHot*>(node);
     auto arg_shape = args[0].get_shape();
     auto result_shape = out[0].get_shape();
+    auto output_datatype_size = out[0].get_element_type().size();
     size_t idx = onehot->get_one_hot_axis();

     writer.block_begin();
     {
         auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
-        auto index = cuda_emitter->build_onehot(
-            {{args[0].get_type(), out[0].get_type()}}, arg_shape, result_shape, idx);
+        auto index = cuda_emitter->build_onehot({{args[0].get_type(), out[0].get_type()}},
+                                                arg_shape,
+                                                result_shape,
+                                                idx,
+                                                output_datatype_size);
         writer.block_begin();
         writer << "void* input[] = {" << node_names(args) << "};\n";