Commit 6cd35432 authored by Fenglei, committed by Robert Kimball

nvgpu one hot update (#1773)

* update onehot

* clang

* fix bugs

* format

* add output_datatype_size to hash

* typo

* hash
parent f642bc4c
@@ -170,13 +170,15 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& d
 size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>& dtypes,
                                                NVShape input_shape,
                                                NVShape output_shape,
-                                               size_t one_hot_axis)
+                                               size_t one_hot_axis,
+                                               size_t output_datatype_size)
 {
     std::stringstream kernel_name;
     kernel_name << "onehot_" << join(dtypes, "_");
     std::string hash = kernel_name.str() + "_i_" + join(input_shape, "_") + "_o_" +
-                       join(output_shape, "_") + std::to_string(one_hot_axis);
+                       join(output_shape, "_") + "_axis_" + std::to_string(one_hot_axis) +
+                       "_datasize_" + std::to_string(output_datatype_size);
     // For backwards compatibility we currently use two unordered maps
     // 1. one looks up the compiled cuda kernel (CudaFunctionPool)
     // 2. the other looks to see if this kernel is already in the primitive list
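For illustration, with hypothetical values (input shape {2, 3}, output shape {2, 5, 3}, axis 1, float in and out, a 4-byte output element), the new hash would read:

    onehot_float_float_i_2_3_o_2_5_3_axis_1_datasize_4

Previously the axis was appended to the output-shape digits with no separator and the element size was absent, so distinct instantiations could collide on one cached kernel.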
@@ -206,18 +208,19 @@ size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>&
     uint32_t block_size_x = 64;
     uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
-    uint32_t repeat_times = static_cast<uint32_t>(output_shape[one_hot_axis]);
-    uint32_t repeat_size = 1;
+    uint32_t hot_axis_shape = static_cast<uint32_t>(output_shape[one_hot_axis]);
+    uint32_t hot_axis_stride = 1;
     for (size_t i = one_hot_axis + 1; i < output_shape.size(); i++)
     {
-        repeat_size *= output_shape[i];
+        hot_axis_stride *= output_shape[i];
     }
+    uint32_t output_size = static_cast<uint32_t>(shape_size(output_shape) * output_datatype_size);
     // create the launch primitive
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             std::vector<void*> args_list{
-                &inputs[0], &outputs[0], &repeat_size, &repeat_times, &nthreads};
+                &inputs[0], &outputs[0], &hot_axis_stride, &hot_axis_shape, &nthreads};
+            runtime::gpu::cuda_memset(outputs[0], 0, output_size);
             CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
                                           aligned_grid_size_x,
                                           1,
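As a minimal sketch of the renamed launch parameters, assume an illustrative output_shape of {2, 5, 3} with one_hot_axis = 1 (none of these values are from the patch):

    // output_shape = {2, 5, 3}, one_hot_axis = 1          (illustrative)
    // hot_axis_shape  = output_shape[1] = 5               extent of the one-hot axis
    // hot_axis_stride = 3                                 product of the dims after it
    // output_size     = 2 * 5 * 3 * output_datatype_size  = 120 bytes for a 4-byte type
    //
    // cuda_memset zero-fills the whole output before launch, so the kernel only
    // scatters the 1s; an input whose class index is out of range now leaves its
    // slice of the output all zeros instead of writing out of bounds.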
@@ -99,7 +99,8 @@ namespace ngraph
                 size_t build_onehot(const std::array<std::string, 2>& dtypes,
                                     NVShape input_shape,
                                     NVShape output_shape,
-                                    size_t one_hot_axis);
+                                    size_t one_hot_axis,
+                                    size_t output_datatype_size);

                 size_t build_reverse(const std::array<std::string, 2>& dtypes,
                                      NVShape input_shape,
@@ -435,15 +435,24 @@ void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
                                                     const std::array<std::string, 2>& data_types)
 {
     writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
-           << data_types[1] << "* out, uint32_t m, uint32_t k, uint32_t n)\n";
+           << data_types[1]
+           << "* out, uint32_t hot_axis_stride, uint32_t hot_axis_shape, uint32_t n)\n";
     writer.block_begin();
     {
         writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
         writer << "if (tid < n)\n";
         writer.block_begin();
         {
-            writer << "uint32_t idx = (tid / m) * m * k + (m * in[tid]) + tid % m;\n";
-            writer << "out[idx] = 1;\n";
+            writer << "int32_t in_pixel = static_cast<int32_t>(in[tid]);\n";
+            writer << "if(in_pixel >= 0 && in_pixel < hot_axis_shape)\n";
+            writer.block_begin();
+            {
+                writer << "uint32_t idx = tid / hot_axis_stride * hot_axis_stride * hot_axis_shape "
+                          "+ (hot_axis_stride * in_pixel) + tid % "
+                          "hot_axis_stride;\n";
+                writer << "out[idx] = 1;\n";
+            }
+            writer.block_end();
         }
         writer.block_end();
     }
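Read through the CodeWriter calls, the builder emits a kernel roughly like the one below for a float-to-float instantiation (the function name and dtypes are illustrative, and the real generated prelude may obtain the fixed-width types differently):

    #include <cstdint> // assumption: uint32_t/int32_t visible to the generated source

    extern "C" __global__ void cuda_onehot_float_float(
        float* in, float* out, uint32_t hot_axis_stride, uint32_t hot_axis_shape, uint32_t n)
    {
        // one thread per input element
        uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            // the class index arrives in the input dtype and is truncated to an int
            int32_t in_pixel = static_cast<int32_t>(in[tid]);
            // new bounds check: negative or too-large indices are dropped instead
            // of scattering out of bounds
            if (in_pixel >= 0 && in_pixel < hot_axis_shape)
            {
                uint32_t idx = tid / hot_axis_stride * hot_axis_stride * hot_axis_shape
                               + (hot_axis_stride * in_pixel) + tid % hot_axis_stride;
                out[idx] = 1;
            }
        }
    }

Since dropped indices write nothing, correctness depends on the output being zeroed first, which is what the cuda_memset added in build_onehot provides.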
@@ -792,13 +792,17 @@ void runtime::gpu::GPU_Emitter::emit_OneHot(EMIT_ARGS)
     auto onehot = static_cast<const ngraph::op::OneHot*>(node);
     auto arg_shape = args[0].get_shape();
     auto result_shape = out[0].get_shape();
+    auto output_datatype_size = out[0].get_element_type().size();
     size_t idx = onehot->get_one_hot_axis();

     writer.block_begin();
     {
         auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
-        auto index = cuda_emitter->build_onehot(
-            {{args[0].get_type(), out[0].get_type()}}, arg_shape, result_shape, idx);
+        auto index = cuda_emitter->build_onehot({{args[0].get_type(), out[0].get_type()}},
+                                                arg_shape,
+                                                result_shape,
+                                                idx,
+                                                output_datatype_size);
         writer.block_begin();
         writer << "void* input[] = {" << node_names(args) << "};\n";