Commit c449005a authored by dmyershov, committed by Scott Cyphers

IntelGPU backend: Softmax operation implementation (#1413)

parent 428f567d
......@@ -22,6 +22,7 @@ set(SRC
intelgpu_op_broadcast.cpp
intelgpu_op_custom_kernels.cpp
intelgpu_op_convolution.cpp
intelgpu_op_softmax.cpp
code_writer.cpp
)
......
......@@ -28,6 +28,7 @@
#include <CPP/reorder.hpp>
#include <CPP/reshape.hpp>
#include <CPP/scale.hpp>
#include <CPP/softmax.hpp>
#include <CPP/topology.hpp>
#include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
......@@ -36,6 +37,7 @@
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_convolution.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
#include "ngraph/function.hpp"
......@@ -57,6 +59,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/softmax.hpp"
#include "ngraph/op/sum.hpp"
#include "ngraph/util.hpp"
......@@ -375,6 +378,50 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
const cldnn::concatenation cldnn_concat(get_output_name(op), inputs, cldnn_axis);
topology.add(cldnn_concat);
}
else if ("Softmax" == op->description())
{
arguments_check(op, 1, 1);
const shared_ptr<op::Softmax> softmax_op = static_pointer_cast<op::Softmax>(op);
const AxisSet& axes = softmax_op->get_axes();
const size_t axes_size = axes.size();
const size_t shape_dim_count = get_input_shape(op, 0).size();
// clDNN has limited support for the Softmax operation;
// the checks below decide when to fall back to the custom kernel
if ((shape_dim_count > 3) || ((shape_dim_count == 3) && (axes_size == 2)))
{
do_softmax_operation(topology,
get_input_name(op),
get_input_shape(op),
get_input_type(op),
get_output_name(op),
get_output_shape(op),
get_output_type(op),
axes);
}
else
{
cldnn::softmax::dimension_t dimension = cldnn::softmax::normalize_fyx;
if (axes_size == 1)
{
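// nGraph counts axes from the outermost dimension, while clDNN names dimensions
// from the innermost (x), so reverse the index before mapping it to a clDNN dimension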
size_t axes_idx = shape_dim_count - *(axes.begin()) - 1;
switch (axes_idx)
{
case 0: dimension = cldnn::softmax::normalize_x; break;
case 1: dimension = cldnn::softmax::normalize_y; break;
case 2: dimension = cldnn::softmax::normalize_f; break;
default:
throw invalid_argument("Softmax operation: wrong axis " +
to_string(axes_idx));
}
}
const cldnn::softmax cldnn_softmax(
get_output_name(op), get_input_name(op), dimension);
topology.add(cldnn_softmax);
}
}
else if ("Add" == op->description())
{
do_eltwise_operation(topology, op, cldnn::eltwise_mode::sum);
......
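A note on the dispatch above (the shape and axes here are illustrative, not taken from the commit): clDNN's built-in softmax primitive normalizes over a fixed set of dimension combinations, so inputs of rank greater than 3, or rank-3 inputs reduced over two axes, are routed to the custom kernel in do_softmax_operation; everything else uses cldnn::softmax, defaulting to normalize_fyx when more than one axis is reduced. For a single reduction axis, the nGraph axis is reversed into clDNN's innermost-first numbering, for example a rank-3 input with axes = {2}:

axes_idx = shape_dim_count - axis - 1 = 3 - 2 - 1 = 0  ->  cldnn::softmax::normalize_x (the innermost dimension)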
......@@ -43,13 +43,18 @@ vector<cldnn_arg> runtime::intelgpu::get_kernel_args(size_t input, size_t output
return result;
}
string runtime::intelgpu::array_dims(const Shape& dimentions)
string runtime::intelgpu::array_dims(const Shape& dimentions, const AxisSet& axis)
{
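// dimensions listed in 'axis' are skipped so reduced axes do not appear in the emitted array bounds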
size_t var_idx = 0;
string buffer;
for (auto const& dim : dimentions)
{
buffer += "[" + to_string(dim) + "]";
if (axis.find(var_idx) == axis.end())
{
buffer += "[" + to_string(dim) + "]";
}
++var_idx;
}
if (buffer.empty())
......
......@@ -102,7 +102,7 @@ namespace ngraph
// Helper functions used in cldnn::custom_gpu_primitive kernels
std::vector<cldnn_arg> get_kernel_args(size_t input, size_t output);
std::string array_dims(const Shape& dimentions);
std::string array_dims(const Shape& dimentions, const AxisSet& axis = {});
std::string access_dims(const Shape& dimentions,
const AxisSet& axis = {},
bool is_reversed = false);
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <CPP/custom_gpu_primitive.hpp>
#include "ngraph/runtime/intelgpu/code_writer.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp"
using namespace std;
using namespace ngraph;
static Shape shape_dims(const Shape& dimentions, const AxisSet& axis = {})
{
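// drop the dimensions listed in 'axis'; if every dimension is reduced, return a scalar shape {1}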
size_t var_idx = 0;
Shape output_shape;
for (auto const& dim : dimentions)
{
if (axis.find(var_idx) == axis.end())
{
output_shape.push_back(dim);
}
++var_idx;
}
if (output_shape.size() == 0)
{ // an empty shape denotes a scalar
output_shape.push_back(1);
}
return output_shape;
}
static vector<size_t> generate_loops_w_axes(codegen::CodeWriter& writer,
const Shape& shape,
bool is_begin,
const AxisSet& axis,
const string& expression)
{
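// Opens (is_begin == true) or closes the loop nest of a reduction kernel: dimensions not in
// 'axis' become OpenCL work-items (up to three global dimensions) or outer for loops,
// 'expression' is emitted before the reduction loops, and the dimensions in 'axis' become
// the innermost for loops. Returns the global work size for the kernel.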
const size_t cldnn_gws_lim = 3;
vector<size_t> gws;
size_t var_idx = 0;
size_t dim_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) == axis.end())
{
if (dim_idx < cldnn_gws_lim)
{
if (is_begin)
{
writer << "const unsigned i" << var_idx << " = get_global_id(" << dim_idx
<< ");\n";
gws.push_back(i);
}
++dim_idx;
}
else
{
if (is_begin)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
<< "; ++i" << var_idx << ")\n";
writer.block_begin();
}
else
{
writer.block_end();
}
}
}
++var_idx;
}
if (is_begin)
{
writer << expression;
}
var_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) != axis.end())
{
if (is_begin)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
}
else
{
writer.block_end();
}
}
++var_idx;
}
if (gws.empty())
{
gws.push_back(1);
}
return gws;
}
void runtime::intelgpu::do_softmax_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const element::Type& input_type,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const AxisSet& axes)
{
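// Softmax is built from two custom kernels: the first accumulates exp(input) over the
// reduced axes into an intermediate buffer, the second divides exp(input) by that sum
// for every output element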
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string entry_point_name = "softmax_" + output_name;
const string middle_name = entry_point_name + "_middle";
const string entry_point_middle_name = "softmax_middle_" + output_name;
const string expression = "output" + access_dims(input_shape, axes) + " = 0.0f;\n";
const Shape new_shape = shape_dims(output_shape, axes);
const cldnn::layout layout_middle = IntelGPULayout::create_cldnn_layout(output_type, new_shape);
codegen::CodeWriter writer0;
codegen::CodeWriter writer1;
vector<size_t> gws;
writer0 << "__kernel void " << entry_point_middle_name << "(const __global float input"
<< array_dims(input_shape) << ", __global float output" << array_dims(input_shape, axes)
<< ")\n";
writer0.block_begin();
{
gws = generate_loops_w_axes(writer0, output_shape, true, axes, expression);
writer0 << "output" << access_dims(input_shape, axes) << " += exp(input"
<< access_dims(input_shape) << ");\n";
generate_loops_w_axes(writer0, output_shape, false, axes, "");
}
writer0.block_end();
const cldnn::custom_gpu_primitive op_softmax_middle(middle_name,
{input_name},
{writer0.get_code()},
entry_point_middle_name,
get_kernel_args(1, 1),
"",
layout_middle,
gws);
topology.add(op_softmax_middle);
writer1 << "__kernel void " << entry_point_name << "(const __global float input0"
<< array_dims(input_shape) << ", const __global float input1"
<< array_dims(input_shape, axes) << ", __global float output"
<< array_dims(output_shape) << ")\n";
writer1.block_begin();
{
gws = generate_loops(writer1, output_shape, true);
writer1 << "output" << access_dims(input_shape) << " = exp(input0"
<< access_dims(input_shape) << ")/input1" << access_dims(input_shape, axes)
<< ";\n";
generate_loops(writer1, output_shape, false);
}
writer1.block_end();
const cldnn::custom_gpu_primitive op_softmax(output_name,
{input_name, middle_name},
{writer1.get_code()},
entry_point_name,
get_kernel_args(2, 1),
"",
layout,
gws);
topology.add(op_softmax);
}
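To make the generated source concrete, below is a rough sketch of the two kernels this function would emit for an illustrative 2x3 input reduced over axis 1. The kernel and buffer names are placeholders following the entry-point naming scheme above, and the exact text of the second kernel depends on generate_loops (defined elsewhere in the backend), so treat this as an approximation rather than the literal output:

__kernel void softmax_middle_out(const __global float input[2][3], __global float output[2])
{
    // one work-item per non-reduced element: accumulate exp(input) over axis 1
    const unsigned i0 = get_global_id(0);
    output[i0] = 0.0f;
    for (uint i1 = 0; i1 < 3; ++i1)
    {
        output[i0] += exp(input[i0][i1]);
    }
}

__kernel void softmax_out(const __global float input0[2][3], const __global float input1[2], __global float output[2][3])
{
    // divide each exponential by the per-row sum produced by the first kernel
    const unsigned i0 = get_global_id(0);
    const unsigned i1 = get_global_id(1);
    output[i0][i1] = exp(input0[i0][i1]) / input1[i0];
}

The first kernel is launched with a global work size of {2} and writes the per-row sums into the intermediate buffer; the second is launched over {2, 3} and reads both the original input and that buffer.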
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <CPP/topology.hpp>
#include "ngraph/shape.hpp"
#include "ngraph/type/element_type.hpp"
namespace ngraph
{
namespace runtime
{
namespace intelgpu
{
void do_softmax_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const element::Type& input_type,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const AxisSet& axes);
}
}
}
......@@ -31,10 +31,6 @@ backwards_select_nested
backwards_sigmoid
backwards_sign
backwards_slice
backwards_softmax_3d
backwards_softmax_all
backwards_softmax_axis
backwards_softmax_underflow
backwards_tan
batchnorm_bprop_n4c3h2w2
batch_norm_one_output
......@@ -97,12 +93,6 @@ select_and_scatter_without_overlap
select_and_scatter_with_overlap
sigmoid_bprop_n1c1h4
sign
softmax_all
softmax_axis
softmax_axis_2
softmax_axis_3d
softmax_axis_3d_trivial
softmax_underflow
tan
tensor_constant_int64
validate_call_input_type
......