Commit a3133482 authored by Fenglei, committed by Scott Cyphers

nvgpu cuda softmax optimization (#2101)

* add some helper function

* update with new helper function

* update reduce to nd with new helper function

* update float sum to stable sum

* fix bug

* update all reduce to stable sum for float

* fix bug and pass the sum stable test

* remove debug info

* style

* update with shape

* fix bug

* add host parameters to cuda_emitter

* clang format

* fix bugs

* add element::type support

* format

* add a cached value with datatype name

* add init_reduce_value

* unroll loop

* optimization

* remove the need for init_value

* add memset kernel

* add memcpy

* working version

* remove debug info

* add comments, clean up code.

* change in_idx to input_idx

* fix bug

* change args name for memset in emitter

* pass element::Type instead of string

* the op::reduce comes with an init value, add support

* resolve codacy-bot comment

* fix bug

* resolve codacy-bot comment

* add soft_max_block_reduce kernel

* fix bugs

* add softmax_block_reduce to cuda_emitter

* compiling ok, result wrong

* fix bug in kernel

* working version

* removed unused code

* remove unused comments, resolve comments

* cuda reduce for max, min, mul, reduce op init value, format

* use type::info

* use type info for numeric_limits

* remove code from gpu_host_parameters

* header

* remove outdated comments

* add helper to check if stable sum is needed

* add stable sum test for double

* remove extra line

* consolidate helper functions

* no need for list now.

* remove extra ;

* clang format

* style

* add skip test for cpu and intelGPU side

* resolve more conflicts

* update comment

* fix a warning

* Update src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp

using load.
Co-Authored-By: fengleitian <35274053+fengleitian@users.noreply.github.com>

* using WARPSIZE instead of 32, using lambda

* more WARPSIZE instead of 32

* fix block_size_x bug

* using __expf
parent 6584306c
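
Taken together, the commit messages describe the optimization in three parts: float reductions switch to a numerically stable ("stable") sum, index arithmetic in the kernels moves from integer division to precomputed magic-multiply/shift pairs, and softmax gets a dedicated block-reduce kernel that is aware of the warp size (WARPSIZE) and uses the fast __expf intrinsic. The sketches below illustrate two of these techniques; all names (stable_sum_step, softmax_block_reduce_sketch, reduce_count) are hypothetical and the code is not what CudaKernelBuilder emits.

The stable sum is presumably a compensated (Kahan-style) accumulation, which carries a correction term so that adding many small floats to a large partial sum does not lose precision:

```cpp
// Minimal sketch of compensated (Kahan) summation, assuming this is the kind
// of "stable sum" the commits refer to; not nGraph's actual helper.
__host__ __device__ inline void stable_sum_step(float& sum, float& c, float x)
{
    float y = x - c;   // apply the correction carried from earlier additions
    float t = sum + y; // low-order bits of y may be lost in this add
    c = (t - sum) - y; // capture the lost bits in the correction term
    sum = t;           // updated running total
}
```

The block-reduce softmax path assigns one thread block per non-reduced element and reduces the softmax axis cooperatively in shared memory; it applies when the reduced axis is innermost and contiguous. A minimal sketch of that pattern (the generated kernel additionally uses the magic/shift index math and WARPSIZE-level reductions):

```cpp
#include <cstdint>

// Illustrative kernel: block-stride loop over the reduced axis, power-of-two
// block size, shared-memory tree reduction, __expf for the exponential.
extern "C" __global__ void softmax_block_reduce_sketch(const float* in,
                                                       float* out,
                                                       uint32_t reduce_count)
{
    extern __shared__ float sdata[]; // block_size_x floats, sized at launch
    const float* row_in = in + static_cast<size_t>(blockIdx.x) * reduce_count;
    float* row_out = out + static_cast<size_t>(blockIdx.x) * reduce_count;

    // Each thread accumulates a partial sum of exp(x) over a strided range.
    float partial = 0.0f;
    for (uint32_t i = threadIdx.x; i < reduce_count; i += blockDim.x)
    {
        partial += __expf(row_in[i]);
    }
    sdata[threadIdx.x] = partial;
    __syncthreads();

    // Tree reduction in shared memory; assumes blockDim.x is a power of two.
    for (uint32_t s = blockDim.x >> 1; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
        {
            sdata[threadIdx.x] += sdata[threadIdx.x + s];
        }
        __syncthreads();
    }
    float sum = sdata[0];

    // Normalize. (A production softmax usually subtracts the row max before
    // exponentiating to avoid overflow; omitted here for brevity.)
    for (uint32_t i = threadIdx.x; i < reduce_count; i += blockDim.x)
    {
        row_out[i] = __expf(row_in[i]) / sum;
    }
}
```

Such a kernel would be launched with one block per non-reduced element, block_size_x threads (the largest power of two not exceeding min(512, reduce_count), as chosen in build_softmax below), and block_size_x * sizeof(float) bytes of dynamic shared memory.
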
@@ -1628,21 +1628,27 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
return this->m_primitive_emitter->register_primitive(kernel_launch, hash);
}
size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<std::string>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<element::Type>& dtypes,
NVShape input_shape,
NVShape reduce_axis)
{
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
std::vector<std::string> dtypes_str = get_string_vector(dtypes);
NVShape simplified_reduce_axis;
NVShape simplified_input_shape;
simplify_reduce_shape(input_shape, reduce_axis, simplified_input_shape, simplified_reduce_axis);
size_t rank = simplified_input_shape.size();
size_t reduce_rank = simplified_reduce_axis.size();
size_t non_reduce_rank = rank - reduce_rank;
// assumes NC{d1,...,dn} format
std::string kernel_name = "softmax_" + join(dtypes, "_");
kernel_name +=
"_ri_" + std::to_string(input_shape.size()) + "_rr_" + std::to_string(reduce_axis.size());
std::string kernel_name = "softmax_" + join(dtypes_str, "_");
kernel_name += "_ri_" + std::to_string(simplified_input_shape.size()) + "_rr_" +
std::to_string(simplified_reduce_axis.size());
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
ss << kernel_name << "_s_" << join(input_shape, "_") << "_axis_" << join(reduce_axis, "_");
ss << kernel_name << "_s_" << join(simplified_input_shape, "_") << "_axis_"
<< join(simplified_reduce_axis, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
@@ -1651,45 +1657,64 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<std::string>&
return primitive_index;
}
NVShape reduce_flag(rank, 0);
for (auto a : reduce_axis)
{
reduce_flag[a] = 1;
}
NVShape output_shape;
NVShape non_reduce_shape;
NVShape non_reduce_strides;
NVShape non_reduce_strides_in_input;
NVShape reduce_shape;
NVShape reduce_strides;
NVShape input_strides = row_major_strides(input_shape);
for (int i = 0; i < rank; i++)
{
if (reduce_flag[i] != 0)
NVShape reduce_strides_in_input;
get_reduce_strides(simplified_input_shape,
simplified_reduce_axis,
non_reduce_shape,
non_reduce_strides,
non_reduce_strides_in_input,
reduce_shape,
reduce_strides,
reduce_strides_in_input);
std::vector<int> reduce_strides_magic;
std::vector<int> reduce_strides_shift;
std::vector<int> non_reduce_strides_magic;
std::vector<int> non_reduce_strides_shift;
div_to_mul(reduce_strides, reduce_strides_magic, reduce_strides_shift);
div_to_mul(non_reduce_strides, non_reduce_strides_magic, non_reduce_strides_shift);
uint32_t nthreads = static_cast<uint32_t>(shape_size(non_reduce_shape));
// if reduce shape is empty, all results should be 1.
if (reduce_shape.empty())
{
reduce_shape.push_back(input_shape[i]);
reduce_strides.push_back(input_strides[i]);
size_t memset_idx = build_memset(dtypes_str[0], nthreads);
void* init_value =
m_host_parameters->val_by_datatype(dtypes_str[0], static_cast<int64_t>(1));
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
// (lazy) allocation for kernel arguments
size_t idx_init_value = allocator.reserve_argspace(init_value, dtypes[0].size());
std::unique_ptr<gpu::primitive> memset(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* init_value_buff = runtime::gpu::invoke_memory_primitive(m_ctx, idx_init_value);
gpu::invoke_primitive(m_ctx,
memset_idx,
std::vector<void*>{init_value_buff}.data(),
std::vector<void*>{outputs[0]}.data());
}});
return this->m_primitive_emitter->register_primitive(memset, hash);
}
else
// if the reduction does not include the last axis, use this path; choosing by reduce axis is a heuristic for better cache behavior
// (a more accurate but slower way would be to tune with the actual kernel)
else if (reduce_strides_in_input.back() != 1)
{
non_reduce_strides.push_back(input_strides[i]);
output_shape.push_back(input_shape[i]);
}
}
NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
if (reduce_flag.back() == 1)
{
block_size_x = 8;
}
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "out")
.add("out_strides", output_strides)
args.add_placeholder(dtypes_str[0], "in")
.add_placeholder(dtypes_str[1], "out")
.add("non_reduce_strides", non_reduce_strides)
.add("non_reduce_strides_in_input", non_reduce_strides_in_input)
.add("reduce_strides_in_input", reduce_strides_in_input)
.add("reduce_shape", reduce_shape)
.add("reduce_strides", reduce_strides)
.add("nthreads", nthreads);
// if the kernel has not been compiled, build it
@@ -1699,7 +1724,7 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<std::string>&
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
runtime::gpu::CudaKernelBuilder::get_softmax_op(
writer, kernel_name, args, dtypes, non_reduce_rank, reduce_rank);
writer, kernel_name, args, dtypes_str, non_reduce_rank, reduce_rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
@@ -1724,6 +1749,65 @@ size_t runtime::gpu::CUDAEmitter::build_softmax(const std::vector<std::string>&
}});
return this->m_primitive_emitter->register_primitive(softmax, hash);
}
else
{
uint32_t reduce_count = static_cast<uint32_t>(shape_size(reduce_shape));
uint32_t block_size_x = 1;
while ((block_size_x << 1) <= fmin(512, reduce_count))
{
block_size_x <<= 1;
}
uint32_t shared_data_bytes = block_size_x * static_cast<uint32_t>(dtypes[0].size());
uint32_t aligned_grid_size_x = nthreads;
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes_str[0], "in")
.add_placeholder(dtypes_str[1], "out")
.add("non_reduce_strides", non_reduce_strides)
.add("non_reduce_strides_magic", non_reduce_strides_magic)
.add("non_reduce_strides_shift", non_reduce_strides_shift)
.add("non_reduce_strides_in_input", non_reduce_strides_in_input)
.add("reduce_strides", reduce_strides)
.add("reduce_strides_magic", reduce_strides_magic)
.add("reduce_strides_shift", reduce_strides_shift)
.add("reduce_strides_in_input", reduce_strides_in_input)
.add("reduce_count", reduce_count)
.add("nthreads", nthreads);
// if the kernel has not been compiled, build it
kernel_name += "_bs_" + std::to_string(block_size_x);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
runtime::gpu::CudaKernelBuilder::get_softmax_block_reduce_op(
writer, kernel_name, args, dtypes_str, non_reduce_rank, reduce_rank, block_size_x);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
std::unique_ptr<gpu::primitive> softmax(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
shared_data_bytes,
nullptr,
args_list,
nullptr));
debug_sync();
}});
return this->m_primitive_emitter->register_primitive(softmax, hash);
}
}
size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<element::Type>& dtypes,
......
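
The div_to_mul helpers used above precompute, for every stride, a magic multiplier and shift so that the kernel can decompose a flat thread index into multi-dimensional coordinates with multiplies and shifts instead of integer division, which is expensive on the GPU. Below is a minimal sketch of the standard round-up method (Granlund–Montgomery); the names and the 64-bit encoding are illustrative and differ from nGraph's int-based magic/shift vectors:

```cpp
#include <cassert>
#include <cstdint>

// Precompute magic and shift for a divisor d (1 <= d <= 2^31) so that for any
// index n < 2^31:  n / d == uint32_t((uint64_t(n) * magic) >> shift).
inline void div_to_mul_sketch(uint32_t d, uint64_t& magic, uint32_t& shift)
{
    assert(d >= 1 && d <= (1u << 31));
    uint32_t l = 0; // l = ceil(log2(d))
    while ((uint64_t(1) << l) < d)
    {
        l++;
    }
    shift = 32 + l;
    magic = ((uint64_t(1) << shift) + d - 1) / d; // ceil(2^(32+l) / d)
}

// Device-side replacement for n / d using the precomputed pair; the remainder
// (the coordinate along that axis) then follows as n - q * d.
__host__ __device__ inline uint32_t fast_div(uint32_t n, uint64_t magic, uint32_t shift)
{
    return static_cast<uint32_t>((static_cast<uint64_t>(n) * magic) >> shift);
}
```

This is why the block-reduce branch passes the strides together with their _magic and _shift companions as kernel arguments.
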
@@ -190,7 +190,7 @@ namespace ngraph
size_t concat_axis,
NVShape output_shape);
size_t build_softmax(const std::vector<std::string>& dtypes,
size_t build_softmax(const std::vector<element::Type>& dtypes,
NVShape input_shape,
NVShape reduce_axis);
......
@@ -208,6 +208,14 @@ namespace ngraph
size_t out_rank,
size_t reduce_rank);
static void get_softmax_block_reduce_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
size_t non_reduce_rank,
size_t reduce_rank,
size_t block_size_x);
static void add_pod_typedefs(codegen::CodeWriter& writer);
static void coordinate_transform_to_multi_d(codegen::CodeWriter& writer,
......
@@ -1477,9 +1477,9 @@ void runtime::gpu::GPU_Emitter::emit_Softmax(EMIT_ARGS)
writer.block_begin();
{
auto axes_set = softmax->get_axes();
std::vector<string> dtypes;
dtypes.push_back(args[0].get_type());
dtypes.push_back(out[0].get_type());
std::vector<element::Type> dtypes;
dtypes.push_back(args[0].get_element_type());
dtypes.push_back(out[0].get_element_type());
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
size_t index = cuda_emitter->build_softmax(dtypes, args[0].get_shape(), axes_set);
......