Commit a3133482 authored by Fenglei, committed by Scott Cyphers

nvgpu cuda softmax optimization (#2101)

* add some helper function

* update with new helper function

* update reduce to nd with new helper function

* update float sum to stable sum

* fix bug

* update all reduce to stable sum for float

* fix bug and pass the sum stable test

* remove debug info

* style

* update with shape

* fix bug

* add host parameters to cuda_emitter

* clang format

* fix bugs

* add element::type support

* format

* add a cached value with datatype name

* add init_reduce_value

* unroll loop

* optimization

* remove the need for init_value

* add memset kernel

* add memcpy

* working version

* remove debug info

* add comments, clean up code.

* change in_idx to input_idx

* fix bug

* change args name for memset in emitter

* pass element::Type instead of string

* the op::reduce comes with an init value, add support

* resolve codacy-bot comment

* fix bug

* resolve codacy-bot comment

* add soft_max_block_reduce kernel

* fix bugs

* add softmax_block_reduce to cuda_emitter

* compiling ok, result wrong

* fix bug in kernel

* working version

* removed unused code

* remove unused comments, resolve comments

* cuda reduce for max, min, mul, reduce op init value, format

* use type::info

* use type info for numeric_limits

* remove code from gpu_host_parameters

* header

* remove outdated comments

* add helper to check if stable sum is needed

* add stable sum test for double

* remove extra line

* consolidate helper functions

* no need for list now.

* remove extra ;

* clang format

* style

* add skip test for cpu and intelGPU side

* resolve more conflicts

* update comment

* fix a warning

* Update src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp

using load.
Co-Authored-By: fengleitian <35274053+fengleitian@users.noreply.github.com>

* using WARPSIZE instead of 32, using lambda

* more WARPSIZE instead of 32

* fix block_size_x bug

* using __expf
parent 6584306c
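
Taken together, the commit messages describe the optimization in three parts: float reductions switch to a numerically stable ("stable") sum, index arithmetic in the kernels moves from integer division to precomputed magic-multiply/shift pairs, and softmax gets a dedicated block-reduce kernel that is aware of the warp size (WARPSIZE) and uses the fast __expf intrinsic. The sketches below illustrate two of these techniques; all names (stable_sum_step, softmax_block_reduce_sketch, reduce_count) are hypothetical and the code is not what CudaKernelBuilder emits.

The stable sum is presumably a compensated (Kahan-style) accumulation, which carries a correction term so that adding many small floats to a large partial sum does not lose precision:

```cpp
// Minimal sketch of compensated (Kahan) summation, assuming this is the kind
// of "stable sum" the commits refer to; not nGraph's actual helper.
__host__ __device__ inline void stable_sum_step(float& sum, float& c, float x)
{
    float y = x - c;   // apply the correction carried from earlier additions
    float t = sum + y; // low-order bits of y may be lost in this add
    c = (t - sum) - y; // capture the lost bits in the correction term
    sum = t;           // updated running total
}
```

The block-reduce softmax path assigns one thread block per non-reduced element and reduces the softmax axis cooperatively in shared memory; it applies when the reduced axis is innermost and contiguous. A minimal sketch of that pattern (the generated kernel additionally uses the magic/shift index math and WARPSIZE-level reductions):

```cpp
#include <cstdint>

// Illustrative kernel: block-stride loop over the reduced axis, power-of-two
// block size, shared-memory tree reduction, __expf for the exponential.
extern "C" __global__ void softmax_block_reduce_sketch(const float* in,
                                                       float* out,
                                                       uint32_t reduce_count)
{
    extern __shared__ float sdata[]; // block_size_x floats, sized at launch
    const float* row_in = in + static_cast<size_t>(blockIdx.x) * reduce_count;
    float* row_out = out + static_cast<size_t>(blockIdx.x) * reduce_count;

    // Each thread accumulates a partial sum of exp(x) over a strided range.
    float partial = 0.0f;
    for (uint32_t i = threadIdx.x; i < reduce_count; i += blockDim.x)
    {
        partial += __expf(row_in[i]);
    }
    sdata[threadIdx.x] = partial;
    __syncthreads();

    // Tree reduction in shared memory; assumes blockDim.x is a power of two.
    for (uint32_t s = blockDim.x >> 1; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
        {
            sdata[threadIdx.x] += sdata[threadIdx.x + s];
        }
        __syncthreads();
    }
    float sum = sdata[0];

    // Normalize. (A production softmax usually subtracts the row max before
    // exponentiating to avoid overflow; omitted here for brevity.)
    for (uint32_t i = threadIdx.x; i < reduce_count; i += blockDim.x)
    {
        row_out[i] = __expf(row_in[i]) / sum;
    }
}
```

Such a kernel would be launched with one block per non-reduced element, block_size_x threads (the largest power of two not exceeding min(512, reduce_count), as chosen in build_softmax below), and block_size_x * sizeof(float) bytes of dynamic shared memory.
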
@@ -1628,21 +1628,27 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
return this->m_primitive_emitter->register_primitive(kernel_launch, hash);
}
size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<std::string>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<element::Type>& dtypes,
NVShape input_shape,
NVShape reduce_axis)
{
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
std::vector<std::string> dtypes_str = get_string_vector(dtypes);
NVShape simplified_reduce_axis;
NVShape simplified_input_shape;
simplify_reduce_shape(input_shape, reduce_axis, simplified_input_shape, simplified_reduce_axis);
size_t rank = simplified_input_shape.size();
size_t reduce_rank = simplified_reduce_axis.size();
size_t non_reduce_rank = rank - reduce_rank;
// assumes NC{d1,...,dn} format
std::string kernel_name = "softmax_" + join(dtypes, "_");
kernel_name +=
"_ri_" + std::to_string(input_shape.size()) + "_rr_" + std::to_string(reduce_axis.size());
std::string kernel_name = "softmax_" + join(dtypes_str, "_");
kernel_name += "_ri_" + std::to_string(simplified_input_shape.size()) + "_rr_" +
std::to_string(simplified_reduce_axis.size());
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
ss << kernel_name << "_s_" << join(input_shape, "_") << "_axis_" << join(reduce_axis, "_");
ss << kernel_name << "_s_" << join(simplified_input_shape, "_") << "_axis_"
<< join(simplified_reduce_axis, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
@@ -1651,45 +1657,64 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<std::string>&
return primitive_index;
}
NVShape reduce_flag(rank, 0);
for (auto a : reduce_axis)
{
reduce_flag[a] = 1;
}
NVShape output_shape;
NVShape non_reduce_shape;
NVShape non_reduce_strides;
NVShape non_reduce_strides_in_input;
NVShape reduce_shape;
NVShape reduce_strides;
NVShape input_strides = row_major_strides(input_shape);
for (int i = 0; i < rank; i++)
{
if (reduce_flag[i] != 0)
NVShape reduce_strides_in_input;
get_reduce_strides(simplified_input_shape,
simplified_reduce_axis,
non_reduce_shape,
non_reduce_strides,
non_reduce_strides_in_input,
reduce_shape,
reduce_strides,
reduce_strides_in_input);
std::vector<int> reduce_strides_magic;
std::vector<int> reduce_strides_shift;
std::vector<int> non_reduce_strides_magic;
std::vector<int> non_reduce_strides_shift;
div_to_mul(reduce_strides, reduce_strides_magic, reduce_strides_shift);
div_to_mul(non_reduce_strides, non_reduce_strides_magic, non_reduce_strides_shift);
uint32_t nthreads = static_cast<uint32_t>(shape_size(non_reduce_shape));
// if reduce shape is empty, all results should be 1.
if (reduce_shape.empty())
{
reduce_shape.push_back(input_shape[i]);
reduce_strides.push_back(input_strides[i]);
size_t memset_idx = build_memset(dtypes_str[0], nthreads);
void* init_value =
m_host_parameters->val_by_datatype(dtypes_str[0], static_cast<int64_t>(1));
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
// (lazy) allocation for kernel arguments
size_t idx_init_value = allocator.reserve_argspace(init_value, dtypes[0].size());
std::unique_ptr<gpu::primitive> memset(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* init_value_buff = runtime::gpu::invoke_memory_primitive(m_ctx, idx_init_value);
gpu::invoke_primitive(m_ctx,
memset_idx,
std::vector<void*>{init_value_buff}.data(),
std::vector<void*>{outputs[0]}.data());
}});
return this->m_primitive_emitter->register_primitive(memset, hash);
}
else
// if the reduction does not include the last axis, use this path; choosing by reduce axis is a heuristic for better cache behavior
// (a more accurate but slower way would be to tune with the actual kernel)
else if (reduce_strides_in_input.back() != 1)
{
non_reduce_strides.push_back(input_strides[i]);
output_shape.push_back(input_shape[i]);
}
}
NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
if (reduce_flag.back() == 1)
{
block_size_x = 8;
}
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "out")
.add("out_strides", output_strides)
args.add_placeholder(dtypes_str[0], "in")
.add_placeholder(dtypes_str[1], "out")
.add("non_reduce_strides", non_reduce_strides)
.add("non_reduce_strides_in_input", non_reduce_strides_in_input)
.add("reduce_strides_in_input", reduce_strides_in_input)
.add("reduce_shape", reduce_shape)
.add("reduce_strides", reduce_strides)
.add("nthreads", nthreads);
// if the kernel has not been compiled, build it
@@ -1699,7 +1724,7 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<std::string>&
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
runtime::gpu::CudaKernelBuilder::get_softmax_op(
writer, kernel_name, args, dtypes, non_reduce_rank, reduce_rank);
writer, kernel_name, args, dtypes_str, non_reduce_rank, reduce_rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
@@ -1724,6 +1749,65 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<std::string>&
}});
return this->m_primitive_emitter->register_primitive(softmax, hash);
}
else
{
uint32_t reduce_count = static_cast<uint32_t>(shape_size(reduce_shape));
uint32_t block_size_x = 1;
while ((block_size_x << 1) <= fmin(512, reduce_count))
{
block_size_x <<= 1;
}
uint32_t shared_data_bytes = block_size_x * static_cast<uint32_t>(dtypes[0].size());
uint32_t aligned_grid_size_x = nthreads;
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes_str[0], "in")
.add_placeholder(dtypes_str[1], "out")
.add("non_reduce_strides", non_reduce_strides)
.add("non_reduce_strides_magic", non_reduce_strides_magic)
.add("non_reduce_strides_shift", non_reduce_strides_shift)
.add("non_reduce_strides_in_input", non_reduce_strides_in_input)
.add("reduce_strides", reduce_strides)
.add("reduce_strides_magic", reduce_strides_magic)
.add("reduce_strides_shift", reduce_strides_shift)
.add("reduce_strides_in_input", reduce_strides_in_input)
.add("reduce_count", reduce_count)
.add("nthreads", nthreads);
// if the kernel has not been compiled, build it
kernel_name += "_bs_" + std::to_string(block_size_x);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer, kernel_name, args, dtypes_str, non_reduce_rank, reduce_rank, block_size_x);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
std::unique_ptr<gpu::primitive> softmax(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
shared_data_bytes,
nullptr,
args_list,
nullptr));
debug_sync();
}});
return this->m_primitive_emitter->register_primitive(softmax, hash);
}
}
size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<element::Type>& dtypes,
......
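
The div_to_mul helpers used above precompute, for every stride, a magic multiplier and shift so that the kernel can decompose a flat thread index into multi-dimensional coordinates with multiplies and shifts instead of integer division, which is expensive on the GPU. Below is a minimal sketch of the standard round-up method (Granlund–Montgomery); the names and the 64-bit encoding are illustrative and differ from nGraph's int-based magic/shift vectors:

```cpp
#include <cassert>
#include <cstdint>

// Precompute magic and shift for a divisor d (1 <= d <= 2^31) so that for any
// index n < 2^31:  n / d == uint32_t((uint64_t(n) * magic) >> shift).
inline void div_to_mul_sketch(uint32_t d, uint64_t& magic, uint32_t& shift)
{
    assert(d >= 1 && d <= (1u << 31));
    uint32_t l = 0; // l = ceil(log2(d))
    while ((uint64_t(1) << l) < d)
    {
        l++;
    }
    shift = 32 + l;
    magic = ((uint64_t(1) << shift) + d - 1) / d; // ceil(2^(32+l) / d)
}

// Device-side replacement for n / d using the precomputed pair; the remainder
// (the coordinate along that axis) then follows as n - q * d.
__host__ __device__ inline uint32_t fast_div(uint32_t n, uint64_t magic, uint32_t shift)
{
    return static_cast<uint32_t>((static_cast<uint64_t>(n) * magic) >> shift);
}
```

This is why the block-reduce branch passes the strides together with their _magic and _shift companions as kernel arguments.
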
@@ -190,7 +190,7 @@ namespace ngraph
size_t concat_axis,
NVShape output_shape);
size_t build_softmax(const std::vector<std::string>& dtypes,
size_t build_softmax(const std::vector<element::Type>& dtypes,
NVShape input_shape,
NVShape reduce_axis);
......
@@ -208,6 +208,14 @@ namespace ngraph
size_t out_rank,
size_t reduce_rank);
static void get_softmax_block_reduce_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
size_t non_reduce_rank,
size_t reduce_rank,
size_t block_size_x);
static void add_pod_typedefs(codegen::CodeWriter& writer);
static void coordinate_transform_to_multi_d(codegen::CodeWriter& writer,
......
@@ -1477,9 +1477,9 @@ void runtime::gpu::GPU_Emitter::emit_Softmax(EMIT_ARGS)
writer.block_begin();
{
auto axes_set = softmax->get_axes();
std::vector<string> dtypes;
dtypes.push_back(args[0].get_type());
dtypes.push_back(out[0].get_type());
std::vector<element::Type> dtypes;
dtypes.push_back(args[0].get_element_type());
dtypes.push_back(out[0].get_element_type());
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
size_t index = cuda_emitter->build_softmax(dtypes, args[0].get_shape(), axes_set);
......