Commit be9f031e authored by Fenglei, committed by Robert Kimball

nvgpu softmax cuda version (#2014)

* add softmax cuda support

* optimize block size

* remove debug info

* remove debug

* style

* remove unused

* remove cudnn softmax

* format

* using nullptr

* move helper, add test

* fix style

* using all_close_f

* using kahansum

* style

* remove commented out code
parent 702d465a
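
For reference, the numerical techniques the commit message points at (subtracting the per-axis maximum before exponentiating, and Kahan-compensated accumulation) look roughly like the host-side C++ sketch below. This is an illustration only, not the emitted CUDA kernel, and the helper name stable_softmax is made up for this example.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

// Numerically stable softmax over one axis, with a Kahan-compensated sum for
// the denominator. Illustrative host-side sketch only; the change itself adds
// an emitted CUDA kernel.
std::vector<float> stable_softmax(const std::vector<float>& x)
{
    float m = *std::max_element(x.begin(), x.end());

    // Kahan summation: carry a compensation term to limit rounding error
    // while accumulating the exponentials.
    float sum = 0.0f;
    float comp = 0.0f;
    for (float v : x)
    {
        float term = std::exp(v - m) - comp;
        float next = sum + term;
        comp = (next - sum) - term;
        sum = next;
    }

    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(x[i] - m) / sum;
    }
    return y;
}

int main()
{
    // Mirrors the softmax_overflow test at the bottom of this diff: a FLT_MAX
    // input must not produce inf/NaN because the maximum is subtracted first.
    auto y = stable_softmax({std::numeric_limits<float>::max(), 3.0f});
    std::printf("%g %g\n", y[0], y[1]); // ~1 and ~0
    return 0;
}
```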
@@ -44,7 +44,6 @@ namespace ngraph
friend class GPUPrimitiveEmitter;
public:
size_t build_primitive(const op::Softmax* node);
size_t build_primitive(const op::Convolution* node);
size_t build_primitive(const op::MaxPool* node);
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
@@ -186,10 +185,9 @@ namespace ngraph
size_t concat_axis,
NVShape output_shape);
size_t build_softmax_divide(const std::vector<std::string>& dtypes,
size_t build_softmax(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape reduce_shape,
std::vector<size_t> axes_flag);
NVShape reduce_axis);
void debug_sync();
void sync();
......
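
The reworked build_softmax is driven by the input shape plus the axes to reduce over, replacing the old reduce_shape/axes_flag pair. As a hedged reference for the intended semantics, the sketch below computes softmax over an arbitrary axis subset of a row-major tensor on the host. Plain std::vector stands in for NVShape and AxisVector, the function name softmax_over_axes is invented for illustration, and this is not the generated CUDA kernel.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// Reference semantics for softmax reduced over an arbitrary subset of axes of
// a row-major tensor, which is what build_softmax's input_shape / reduce_axis
// pair describes. Host-side sketch only.
std::vector<float> softmax_over_axes(const std::vector<float>& in,
                                     const std::vector<std::size_t>& shape,
                                     const std::vector<std::size_t>& reduce_axes)
{
    std::size_t rank = shape.size();

    // Row-major strides.
    std::vector<std::size_t> stride(rank, 1);
    for (std::size_t d = rank; d-- > 1;)
    {
        stride[d - 1] = stride[d] * shape[d];
    }

    std::vector<bool> reduced(rank, false);
    for (std::size_t a : reduce_axes)
    {
        reduced[a] = true;
    }

    // Each element belongs to the "slice" identified by its non-reduced coordinates.
    auto slice_of = [&](std::size_t idx) {
        std::size_t key = 0;
        for (std::size_t d = 0; d < rank; ++d)
        {
            std::size_t coord = (idx / stride[d]) % shape[d];
            if (!reduced[d])
            {
                key = key * shape[d] + coord;
            }
        }
        return key;
    };

    std::size_t num_slices = 1;
    for (std::size_t d = 0; d < rank; ++d)
    {
        if (!reduced[d])
        {
            num_slices *= shape[d];
        }
    }

    // Per-slice maximum (for numerical stability) and sum of shifted exponentials.
    std::vector<float> mx(num_slices, -std::numeric_limits<float>::infinity());
    std::vector<float> sum(num_slices, 0.0f);
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        mx[slice_of(i)] = std::max(mx[slice_of(i)], in[i]);
    }
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        sum[slice_of(i)] += std::exp(in[i] - mx[slice_of(i)]);
    }

    std::vector<float> out(in.size());
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        out[i] = std::exp(in[i] - mx[slice_of(i)]) / sum[slice_of(i)];
    }
    return out;
}
```

With shape {2, 3} and reduce_axes {0}, for instance, this reproduces the column-wise softmax that the softmax_overflow test at the bottom of this diff checks.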
@@ -1874,74 +1874,6 @@ size_t runtime::gpu::CUDNNEmitter::build_lrn(const std::string& dtype,
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "softmax_op_" << mode << "_dtype_" << dtype << "_alg" << algorithm << "_dir"
<< static_cast<int>(direction) << "_s" << join(tensor_shape, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
cudnnDataType_t data_type = get_cudnn_datatype(dtype);
cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
auto& tensor_desc = tensor_descriptor_from_shape(tensor_shape, data_type, tensor_format);
void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
std::unique_ptr<runtime::gpu::primitive> softmax;
switch (direction)
{
case Prop::Forward:
case Prop::Inference:
{
softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSoftmaxForward(*m_ctx->cudnn_handle,
algorithm,
mode,
alpha,
tensor_desc,
inputs[0],
beta,
tensor_desc,
outputs[0]));
debug_sync();
}});
break;
}
case Prop::Backward:
{
softmax.reset(new gpu::primitive{[=, &tensor_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSoftmaxBackward(*m_ctx->cudnn_handle,
algorithm,
mode,
alpha,
tensor_desc,
inputs[0],
tensor_desc,
inputs[1],
beta,
tensor_desc,
outputs[0]));
debug_sync();
}});
break;
}
}
return this->m_primitive_emitter->register_primitive(softmax, hash);
}
void runtime::gpu::CUDNNEmitter::sync()
{
CUDA_RT_SAFE_CALL(cudaDeviceSynchronize());
......
@@ -155,12 +155,6 @@ namespace ngraph
const double lrn_bias,
const size_t lrn_size);
size_t build_softmax(const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape);
void debug_sync();
void sync();
......
@@ -186,11 +186,12 @@ namespace ngraph
int sm_tile_size = 8,
int reg_tile_size = 1);
static void get_softmax_divide_op(codegen::CodeWriter& writer,
static void get_softmax_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
std::vector<size_t> axes_flag,
size_t rank);
size_t out_rank,
size_t reduce_rank);
static void add_pod_typedefs(codegen::CodeWriter& writer);
......
@@ -1528,23 +1528,17 @@ void runtime::gpu::GPU_Emitter::emit_Softmax(EMIT_ARGS)
auto softmax = static_cast<const ngraph::op::Softmax*>(node);
writer.block_begin();
{
size_t index;
if (softmax->get_axes().size() != args[0].get_shape().size())
{
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
index = cuda_emitter->build_primitive(softmax);
}
else
auto axes_set = softmax->get_axes();
ngraph::AxisVector axes_vec;
for (auto a : axes_set)
{
auto& cudnn_emitter = external_function->get_primitive_emitter()->get_cudnn_emitter();
index = cudnn_emitter->build_softmax(CUDNN_SOFTMAX_FAST,
CUDNN_SOFTMAX_MODE_INSTANCE,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
args[0].get_shape());
axes_vec.push_back(a);
}
std::vector<string> dtypes;
dtypes.push_back(args[0].get_type());
dtypes.push_back(out[0].get_type());
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
size_t index = cuda_emitter->build_softmax(dtypes, args[0].get_shape(), axes_vec);
writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n";
......
@@ -4187,6 +4187,34 @@ NGRAPH_TEST(${BACKEND_NAME}, softmax_underflow)
EXPECT_TRUE(test::all_close(expected, read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, softmax_overflow)
{
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
auto high = std::numeric_limits<float>::max();
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, vector<float>{high, 1, 2, 3, 4, 5});
auto result = backend->create_tensor(element::f32, shape);
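// With a stable softmax, the column containing `high` is shifted by its own
// maximum: expf(high - high) == 1 and expf(3 - high) underflows to 0, so d0
// is effectively 1 and that column's expected outputs are ~{1, 0}.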
auto d0 = expf(high - high) + expf(3 - high);
auto d1 = expf(1) + expf(4);
auto d2 = expf(2) + expf(5);
backend->call_with_validate(f, {result}, {a});
vector<float> expected{expf(high - high) / d0,
expf(1) / d1,
expf(2) / d2,
expf(3 - high) / d0,
expf(4) / d1,
expf(5) / d2};
EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, multiple_backends)
{
Shape shape{2, 2};
......