Commit be9f031e authored by Fenglei, committed by Robert Kimball

nvgpu softmax cuda version (#2014)

* add softmax cuda support

* optimize block size

* remove debug info

* remove debug

* style

* remove unused

* remove cudnn softmax

* format

* using nullptr

* move helper, add test

* fix style

* using all_close_f

* using kahansum

* style

* remove commented out code
parent 702d465a
This diff is collapsed.
...@@ -44,7 +44,6 @@ namespace ngraph ...@@ -44,7 +44,6 @@ namespace ngraph
friend class GPUPrimitiveEmitter; friend class GPUPrimitiveEmitter;
public: public:
size_t build_primitive(const op::Softmax* node);
size_t build_primitive(const op::Convolution* node); size_t build_primitive(const op::Convolution* node);
size_t build_primitive(const op::MaxPool* node); size_t build_primitive(const op::MaxPool* node);
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op); size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
...@@ -186,10 +185,9 @@ namespace ngraph ...@@ -186,10 +185,9 @@ namespace ngraph
size_t concat_axis, size_t concat_axis,
NVShape output_shape); NVShape output_shape);
size_t build_softmax_divide(const std::vector<std::string>& dtypes, size_t build_softmax(const std::vector<std::string>& dtypes,
NVShape input_shape, NVShape input_shape,
NVShape reduce_shape, NVShape reduce_axis);
std::vector<size_t> axes_flag);
void debug_sync(); void debug_sync();
void sync(); void sync();
......
...@@ -1874,74 +1874,6 @@ size_t runtime::gpu::CUDNNEmitter::build_lrn(const std::string& dtype, ...@@ -1874,74 +1874,6 @@ size_t runtime::gpu::CUDNNEmitter::build_lrn(const std::string& dtype,
return primitive_index; return primitive_index;
} }
size_t runtime::gpu::CUDNNEmitter::build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "softmax_op_" << mode << "_dtype_" << dtype << "_alg" << algorithm << "_dir"
<< static_cast<int>(direction) << "_s" << join(tensor_shape, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
cudnnDataType_t data_type = get_cudnn_datatype(dtype);
cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
auto& tensor_desc = tensor_descriptor_from_shape(tensor_shape, data_type, tensor_format);
void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
std::unique_ptr<runtime::gpu::primitive> softmax;
switch (direction)
{
case Prop::Forward:
case Prop::Inference:
{
softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSoftmaxForward(*m_ctx->cudnn_handle,
algorithm,
mode,
alpha,
tensor_desc,
inputs[0],
beta,
tensor_desc,
outputs[0]));
debug_sync();
}});
break;
}
case Prop::Backward:
{
softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSoftmaxBackward(*m_ctx->cudnn_handle,
algorithm,
mode,
alpha,
tensor_desc,
inputs[0],
tensor_desc,
inputs[1],
beta,
tensor_desc,
outputs[0]));
debug_sync();
}});
break;
}
}
return this->m_primitive_emitter->register_primitive(softmax, hash);
}
void runtime::gpu::CUDNNEmitter::sync() void runtime::gpu::CUDNNEmitter::sync()
{ {
CUDA_RT_SAFE_CALL(cudaDeviceSynchronize()); CUDA_RT_SAFE_CALL(cudaDeviceSynchronize());
......
...@@ -155,12 +155,6 @@ namespace ngraph ...@@ -155,12 +155,6 @@ namespace ngraph
const double lrn_bias, const double lrn_bias,
const size_t lrn_size); const size_t lrn_size);
size_t build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape);
void debug_sync(); void debug_sync();
void sync(); void sync();
......
...@@ -186,11 +186,12 @@ namespace ngraph ...@@ -186,11 +186,12 @@ namespace ngraph
int sm_tile_size = 8, int sm_tile_size = 8,
int reg_tile_size = 1); int reg_tile_size = 1);
static void get_softmax_divide_op(codegen::CodeWriter& writer, static void get_softmax_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types, const std::vector<std::string>& data_types,
std::vector<size_t> axes_flag, size_t out_rank,
size_t rank); size_t reduce_rank);
static void add_pod_typedefs(codegen::CodeWriter& writer); static void add_pod_typedefs(codegen::CodeWriter& writer);
......
...@@ -1528,23 +1528,17 @@ void runtime::gpu::GPU_Emitter::emit_Softmax(EMIT_ARGS) ...@@ -1528,23 +1528,17 @@ void runtime::gpu::GPU_Emitter::emit_Softmax(EMIT_ARGS)
auto softmax = static_cast<const ngraph::op::Softmax*>(node); auto softmax = static_cast<const ngraph::op::Softmax*>(node);
writer.block_begin(); writer.block_begin();
{ {
size_t index; auto axes_set = softmax->get_axes();
if (softmax->get_axes().size() != args[0].get_shape().size()) ngraph::AxisVector axes_vec;
{ for (auto a : axes_set)
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
index = cuda_emitter->build_primitive(softmax);
}
else
{ {
auto& cudnn_emitter = external_function->get_primitive_emitter()->get_cudnn_emitter(); axes_vec.push_back(a);
index = cudnn_emitter->build_softmax(CUDNN_SOFTMAX_FAST,
CUDNN_SOFTMAX_MODE_INSTANCE,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
args[0].get_shape());
} }
std::vector<string> dtypes;
dtypes.push_back(args[0].get_type());
dtypes.push_back(out[0].get_type());
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
size_t index = cuda_emitter->build_softmax(dtypes, args[0].get_shape(), axes_vec);
writer << "void* input[] = {" << node_names(args) << "};\n"; writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n"; writer << "void* output[] = {" << node_names(out) << "};\n";
......
...@@ -4187,6 +4187,34 @@ NGRAPH_TEST(${BACKEND_NAME}, softmax_underflow) ...@@ -4187,6 +4187,34 @@ NGRAPH_TEST(${BACKEND_NAME}, softmax_underflow)
EXPECT_TRUE(test::all_close(expected, read_vector<float>(result))); EXPECT_TRUE(test::all_close(expected, read_vector<float>(result)));
} }
// Runs Softmax over axis 0 of a 2x3 f32 tensor whose first element is the largest finite
// float, and compares the backend result with reference values. The reference for column 0
// is written with `high` subtracted inside each exponential.
NGRAPH_TEST(${BACKEND_NAME}, softmax_overflow)
{
    Shape shape{2, 3};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    auto high = std::numeric_limits<float>::max();

    auto a = backend->create_tensor(element::f32, shape);
    copy_data(a, vector<float>{high, 1, 2, 3, 4, 5});
    auto result = backend->create_tensor(element::f32, shape);

    // Exponentials of the six elements, named by (row, column).
    const float exp_r0c0 = expf(high - high);
    const float exp_r0c1 = expf(1);
    const float exp_r0c2 = expf(2);
    const float exp_r1c0 = expf(3 - high);
    const float exp_r1c1 = expf(4);
    const float exp_r1c2 = expf(5);

    // Reducing over axis 0 sums the two rows of each column.
    const float col0_sum = exp_r0c0 + exp_r1c0;
    const float col1_sum = exp_r0c1 + exp_r1c1;
    const float col2_sum = exp_r0c2 + exp_r1c2;

    vector<float> expected{exp_r0c0 / col0_sum,
                           exp_r0c1 / col1_sum,
                           exp_r0c2 / col2_sum,
                           exp_r1c0 / col0_sum,
                           exp_r1c1 / col1_sum,
                           exp_r1c2 / col2_sum};

    backend->call_with_validate(f, {result}, {a});

    EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, multiple_backends) NGRAPH_TEST(${BACKEND_NAME}, multiple_backends)
{ {
Shape shape{2, 2}; Shape shape{2, 2};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment