Unverified Commit 0c43f175 authored by adstraw, committed by GitHub

add softmax op (#542)

add softmax op and documentation
parent 498fbefd
.. softmax.rst:
#######
Softmax
#######
.. code-block:: cpp

   Softmax  // Softmax operation
Description
===========
Produces a tensor with the same element type and shape as ``arg``,
where the value at each coordinate of ``output`` is the exponential of the
value at the corresponding coordinate of ``arg``, divided by the sum of the
exponentials taken over the coordinates of ``arg`` in the specified ``axes``.
Inputs
------
+-----------------+-------------------------+--------------------------------+
| Name | Element Type | Shape |
+=================+=========================+================================+
| ``arg`` | Any | Any |
+-----------------+-------------------------+--------------------------------+
Parameters
----------
+-----------------+----------------------------------------------------------------+
| Name | Description |
+=================+================================================================+
| ``axes`` | The axis positions (0-based) on which to calculate the softmax |
+-----------------+----------------------------------------------------------------+
Outputs
-------
+-----------------+-------------------------+--------------------------------+
| Name | Element Type | Shape |
+=================+=========================+================================+
| ``output`` | Same as ``arg`` | Same as ``arg`` |
+-----------------+-------------------------+--------------------------------+
Mathematical Definition
=======================
.. math::

   \texttt{output}_{i} = \frac{\exp(\texttt{arg}_{i})}{\sum_{j} \exp(\texttt{arg}_{j})}

where :math:`j` ranges over the coordinates of ``arg`` that agree with :math:`i` on every axis not in ``axes``.
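For example, a softmax normalized along axis 1 of a ``2 x 3`` tensor can be
constructed as follows (a usage sketch; the ``data`` parameter node is
illustrative and not taken from this commit):

.. code-block:: cpp

   auto data = std::make_shared<op::Parameter>(element::f32, Shape{2, 3});
   // Normalize each row of the 2 x 3 tensor independently.
   auto sm = std::make_shared<op::Softmax>(data, AxisSet{1});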
C++ Interface
=============
.. doxygenclass:: ngraph::op::Softmax
:project: ngraph
:members: m_axes
\ No newline at end of file
......@@ -72,6 +72,7 @@ set (SRC
ops/sin.cpp
ops/sinh.cpp
ops/slice.cpp
ops/softmax.cpp
ops/sqrt.cpp
ops/subtract.cpp
ops/sum.cpp
......
......@@ -118,6 +118,7 @@
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/ops/softmax.hpp"
#include <algorithm>
#include <numeric>
#include "ngraph/builder/autobroadcast.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
void ngraph::op::Softmax::generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta)
{
auto z = delta * shared_from_this();
auto zsum = std::make_shared<op::Sum>(z, m_axes);
Shape shape;
for (size_t i = 0; i < get_shape().size(); ++i)
{
if (m_axes.find(i) == m_axes.end())
{
shape.push_back(get_shape()[i]);
}
else
{
shape.push_back(1);
}
}
AxisVector order(zsum->get_shape().size());
std::iota(order.begin(), order.end(), 0);
auto zreshape = std::make_shared<op::Reshape>(zsum, order, shape);
auto adjoint =
z - builder::make_with_numpy_broadcast<op::Multiply>(shared_from_this(), zreshape);
auto x = get_input_op(0);
adjoints.add_delta(x, adjoint);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/ops/util/unary_elementwise_arithmetic.hpp"
namespace ngraph
{
namespace op
{
/// \brief Softmax operation.
///
class Softmax : public util::UnaryElementwiseArithmetic
{
public:
/// \brief Constructs a softmax operation.
///
/// \param arg Node that produces the input tensor.<br>
/// `[d0, ...]`
/// \param axes The axis positions (0-based) on which to calculate the softmax.
///
/// Output `[d0, ...]`
///
Softmax(const std::shared_ptr<Node>& arg, const AxisSet& axes)
: UnaryElementwiseArithmetic("Softmax", arg)
, m_axes(axes)
{
for (auto axis : m_axes)
{
if (axis >= get_shape().size())
{
throw ngraph_error("Axis for softmax reduction operator is out of bounds");
}
}
// empty axes == all axes
if (m_axes.size() == 0)
{
for (size_t i = 0; i < get_shape().size(); ++i)
{
m_axes.insert(i);
}
}
}
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override
{
if (new_args.size() != 1)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<Softmax>(new_args.at(0), m_axes);
}
const AxisSet& get_axes() const { return m_axes; }
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
private:
AxisSet m_axes;
};
}
}
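// Usage note (sketch; `data` is illustrative and not part of this header):
//   auto sm = std::make_shared<op::Softmax>(data, AxisSet{});
// An empty AxisSet is normalized by the constructor to "all axes", and any axis
// index >= the input's rank throws ngraph_error at construction time.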
......@@ -79,6 +79,7 @@
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
......@@ -3188,6 +3189,203 @@ namespace ngraph
writer << " " << out[0].get_size() << ");\n";
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Softmax)
{
const ngraph::op::Softmax* softmax = static_cast<const ngraph::op::Softmax*>(node);
auto type = out[0].get_type();
auto shape = out[0].get_shape();
auto dims = out[0].get_shape().size();
auto axes = softmax->get_axes();
// 0-d (scalar) case: use plain pointers for arg/out
if (dims < 1)
{
writer << type << "* arg = " << args[0].get_name() << ";\n";
writer << type << "* out = " << out[0].get_name() << ";\n";
}
// else cast arg/out to an Nd array
else
{
std::string shape1toN;
for (size_t d = 1; d < dims; ++d)
{
shape1toN += "[";
shape1toN += std::to_string(shape[d]);
shape1toN += "]";
}
writer << type << " (*arg)" << shape1toN << " = (" << type << " (*)"
<< shape1toN << ") " << args[0].get_name() << ";\n";
writer << type << " (*out)" << shape1toN << " = (" << type << " (*)"
<< shape1toN << ") " << out[0].get_name() << ";\n";
}
// build arg/out index
std::string index;
for (size_t d = 0; d < dims; ++d)
{
index += "[i";
index += std::to_string(d);
index += "]";
}
// calculate e ^ (arg - max)
// outer loop(s) - for axis not in axes
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) == axes.end())
{
writer << "#pragma omp parallel for\n";
writer << "for (size_t i" << d << " = 0; i" << d << " < " << shape[d]
<< "; ++i" << d << ")\n";
writer << "{\n";
writer.indent++;
}
}
// max inner loop(s)
writer << type << " m = 0;\n"; // TODO: needs to be minval for the type
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) != axes.end())
{
writer << "for (size_t i" << d << " = 0; i" << d << " < " << shape[d]
<< "; ++i" << d << ")\n";
writer << "{\n";
writer.indent++;
}
}
writer << "if (arg" << index << " > m)\n";
writer << "{\n";
writer.indent++;
writer << "m = arg" << index << ";\n";
writer.indent--;
writer << "}\n";
// end max inner loop(s)
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) != axes.end())
{
writer.indent--;
writer << "}\n";
}
}
// e ^ (arg - max) inner loop
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) != axes.end())
{
writer << "for (size_t i" << d << " = 0; i" << d << " < " << shape[d]
<< "; ++i" << d << ")\n";
writer << "{\n";
writer.indent++;
}
}
writer << "out" << index << " = exp(arg" << index << " - m);\n";
// end e ^ (arg - max) inner loop
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) != axes.end())
{
writer.indent--;
writer << "}\n";
}
}
// end e ^ (arg - max) outer loop(s)
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) == axes.end())
{
writer.indent--;
writer << "}\n";
}
}
// calculate softmax = e ^ (arg - max) / sum (e ^ (arg - max))
// outer loop(s) - for axis not in axes
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) == axes.end())
{
writer << "#pragma omp parallel for\n";
writer << "for (size_t i" << d << " = 0; i" << d << " < " << shape[d]
<< "; ++i" << d << ")\n";
writer << "{\n";
writer.indent++;
}
}
// sum(e ^ (arg - max)) inner loop(s)
writer << type << " d = 0;\n";
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) != axes.end())
{
writer << "for (size_t i" << d << " = 0; i" << d << " < " << shape[d]
<< "; ++i" << d << ")\n";
writer << "{\n";
writer.indent++;
}
}
writer << "d += out" << index << ";\n";
// end sum(e ^ (arg - max)) inner loop(s)
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) != axes.end())
{
writer.indent--;
writer << "}\n";
}
}
writer << "d = 1 / d;\n";
// softmax inner loop(s)
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) != axes.end())
{
writer << "for (size_t i" << d << " = 0; i" << d << " < " << shape[d]
<< "; ++i" << d << ")\n";
writer << "{\n";
writer.indent++;
}
}
writer << "out" << index << " *= d;\n";
// end softmax inner loop(s)
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) != axes.end())
{
writer.indent--;
writer << "}\n";
}
}
// end softmax outer loop(s)
for (size_t d = 0; d < dims; ++d)
{
if (axes.find(d) == axes.end())
{
writer.indent--;
writer << "}\n";
}
}
}
}
}
}
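// Not part of this commit: a sketch, for orientation, of roughly the code the
// emitter above generates for a float tensor of shape {2, 3} with axes = {1}.
// The buffer names arg0/out0 stand in for the names the code generator chooses.
#include <cmath>
#include <cstddef>

static void emitted_softmax_sketch(float* arg0, float* out0)
{
    float (*arg)[3] = (float (*)[3])arg0;
    float (*out)[3] = (float (*)[3])out0;
#pragma omp parallel for
    for (size_t i0 = 0; i0 < 2; ++i0)
    {
        // max over the reduced axis (the emitter's TODO: 0 should be the type's lowest value)
        float m = 0;
        for (size_t i1 = 0; i1 < 3; ++i1)
        {
            if (arg[i0][i1] > m)
            {
                m = arg[i0][i1];
            }
        }
        // e ^ (arg - max)
        for (size_t i1 = 0; i1 < 3; ++i1)
        {
            out[i0][i1] = std::exp(arg[i0][i1] - m);
        }
    }
#pragma omp parallel for
    for (size_t i0 = 0; i0 < 2; ++i0)
    {
        // normalize: divide by sum(e ^ (arg - max)) along the reduced axis
        float d = 0;
        for (size_t i1 = 0; i1 < 3; ++i1)
        {
            d += out[i0][i1];
        }
        d = 1 / d;
        for (size_t i1 = 0; i1 < 3; ++i1)
        {
            out[i0][i1] *= d;
        }
    }
}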
......
......@@ -89,6 +89,7 @@
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
......@@ -239,6 +240,7 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::Min), &runtime::cpu::CPU_Emitter::emit<op::Min>},
{TI(ngraph::op::Relu), &runtime::cpu::CPU_Emitter::emit<op::Relu>},
{TI(ngraph::op::ReluBackprop), &runtime::cpu::CPU_Emitter::emit<op::ReluBackprop>},
{TI(ngraph::op::Softmax), &runtime::cpu::CPU_Emitter::emit<op::Softmax>},
};
runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
......
......@@ -42,6 +42,7 @@
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select_and_scatter.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/runtime/call_frame.hpp"
#include "ngraph/runtime/host_tensor_view.hpp"
......@@ -95,6 +96,7 @@
#include "ngraph/runtime/kernel/sin.hpp"
#include "ngraph/runtime/kernel/sinh.hpp"
#include "ngraph/runtime/kernel/slice.hpp"
#include "ngraph/runtime/kernel/softmax.hpp"
#include "ngraph/runtime/kernel/sqrt.hpp"
#include "ngraph/runtime/kernel/subtract.hpp"
#include "ngraph/runtime/kernel/sum.hpp"
......@@ -812,6 +814,14 @@ private:
slice->get_strides(),
out[0]->get_shape());
}
else if (node_op == "Softmax")
{
const op::Softmax* softmax = static_cast<const op::Softmax*>(&node);
kernel::softmax<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
reinterpret_cast<T*>(out[0]->get_data_ptr()),
out[0]->get_shape(),
softmax->get_axes());
}
else if (node_op == "Sqrt")
{
kernel::sqrt<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cmath>
#include <functional>
#include <numeric>
#include "ngraph/coordinate_transform.hpp"
#include "ngraph/runtime/kernel/max.hpp"
#include "ngraph/runtime/kernel/sum.hpp"
namespace ngraph
{
namespace runtime
{
namespace kernel
{
template <typename T>
void softmax(T* arg, T* out, const Shape& shape, const AxisSet& axes)
{
auto temp_shape = project(shape, axes);
auto temp_elements = std::accumulate(
temp_shape.begin(), temp_shape.end(), 1, std::multiplies<size_t>());
auto temp_ptr = new T[temp_elements];
max(arg, temp_ptr, shape, temp_shape, axes);
CoordinateTransform transform(shape);
CoordinateTransform temp_transform(temp_shape);
for (const Coordinate& coord : transform)
{
Coordinate temp_coord = project(coord, axes);
out[transform.index(coord)] = std::exp(
arg[transform.index(coord)] - temp_ptr[temp_transform.index(temp_coord)]);
}
sum(out, temp_ptr, shape, temp_shape, axes);
for (const Coordinate& coord : transform)
{
Coordinate temp_coord = project(coord, axes);
out[transform.index(coord)] /= temp_ptr[temp_transform.index(temp_coord)];
}
delete[] temp_ptr;
}
}
}
}
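// Not part of this commit: a self-contained sketch of the same numerically
// stable max-subtract / exp / sum / divide sequence as kernel::softmax above,
// specialized to a row-major [rows x cols] buffer reduced over axis 1
// (plain C++, no nGraph types; names are illustrative; assumes cols >= 1).
#include <algorithm>
#include <cmath>
#include <cstddef>

inline void softmax_rows_sketch(const float* arg, float* out, size_t rows, size_t cols)
{
    for (size_t r = 0; r < rows; ++r)
    {
        // max over the reduced axis
        float m = arg[r * cols];
        for (size_t c = 1; c < cols; ++c)
        {
            m = std::max(m, arg[r * cols + c]);
        }
        // e ^ (arg - max), accumulating the sum as we go
        float sum = 0;
        for (size_t c = 0; c < cols; ++c)
        {
            out[r * cols + c] = std::exp(arg[r * cols + c] - m);
            sum += out[r * cols + c];
        }
        // divide by the sum
        for (size_t c = 0; c < cols; ++c)
        {
            out[r * cols + c] /= sum;
        }
    }
}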
......@@ -70,6 +70,7 @@
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
......@@ -712,6 +713,11 @@ static shared_ptr<ngraph::Function>
auto strides = node_js.at("strides").get<vector<size_t>>();
node = make_shared<op::Slice>(args[0], lower_bounds, upper_bounds, strides);
}
else if (node_op == "Softmax")
{
auto reduction_axes = node_js.at("reduction_axes").get<set<size_t>>();
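// "reduction_axes" is the per-node JSON attribute this reader expects for
// Softmax; it is handed straight to the op::Softmax constructor as an AxisSet.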
node = make_shared<op::Softmax>(args[0], reduction_axes);
}
else if (node_op == "Sqrt")
{
node = make_shared<op::Sqrt>(args[0]);
......
......@@ -1303,27 +1303,116 @@ TEST(${BACKEND_NAME}, backwards_slice)
}
}
TEST(${BACKEND_NAME}, backwards_softmax_all)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
test::Uniform<float> rng(-1.0f, 1.0f);
Shape shape{2, 3};
auto x0 = rng.initialize(backend->make_primary_tensor_view<float>(shape));
auto make_graph = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{0, 1}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph, {x0}, .01f, .01f));
}
TEST(${BACKEND_NAME}, backwards_softmax_axis)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
test::Uniform<float> rng(-1.0f, 1.0f);
Shape shape{2, 3};
auto x0 = rng.initialize(backend->make_primary_tensor_view<float>(shape));
auto make_graph = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{1}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph, {x0}, .01f, .01f));
}
TEST(${BACKEND_NAME}, backwards_softmax_underflow)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
auto low = std::numeric_limits<float>::lowest();
Shape shape{2, 3};
auto x0 = backend->make_primary_tensor_view(element::f32, shape);
copy_data(x0, vector<float>{low, 1, 2, 3, 4, 5});
auto make_graph = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{0, 1}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph, {x0}, .01f, .01f));
}
TEST(${BACKEND_NAME}, backwards_softmax_3d)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
test::Uniform<float> rng(-1.0f, 1.0f);
Shape shape{2, 3, 4};
auto x0 = rng.initialize(backend->make_primary_tensor_view<float>(shape));
auto make_graph0 = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{0}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph0, {x0}, .01f, .01f));
auto make_graph1 = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{1}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph1, {x0}, .01f, .01f));
auto make_graph2 = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{2}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph2, {x0}, .01f, .01f));
auto make_graph01 = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{0, 1}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph01, {x0}, .01f, .01f));
auto make_graph02 = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{0, 2}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph02, {x0}, .01f, .01f));
auto make_graph12 = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{1, 2}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph12, {x0}, .01f, .01f));
auto make_graph012 = [shape]() {
auto X0 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Softmax>(X0, AxisSet{0, 1, 2}),
std::vector<std::shared_ptr<op::Parameter>>{X0});
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph012, {x0}, .01f, .01f));
}
TEST(${BACKEND_NAME}, backwards_subtract)
......
......@@ -8447,3 +8447,90 @@ TEST(${BACKEND_NAME}, relu_4Dbackprop)
cf->call({a, delta}, {result});
EXPECT_EQ(read_vector<float>(result), expected);
}
TEST(${BACKEND_NAME}, softmax_all)
{
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f =
make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0, 1}), op::ParameterVector{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape);
copy_data(a, vector<float>{-3, -2, -1, 0, 1, 2});
auto result = backend->make_primary_tensor_view(element::f32, shape);
auto d = expf(-3) + expf(-2) + expf(-1) + expf(0) + expf(1) + expf(2);
cf->call({a}, {result});
vector<float> expected{
expf(-3) / d, expf(-2) / d, expf(-1) / d, expf(0) / d, expf(1) / d, expf(2) / d};
EXPECT_TRUE(test::all_close(expected, read_vector<float>(result)));
// empty AxisSet is the same as "full" AxisSet
f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{}), op::ParameterVector{A});
external = manager->compile(f);
cf = backend->make_call_frame(external);
cf->call({a}, {result});
EXPECT_TRUE(test::all_close(expected, read_vector<float>(result)));
}
TEST(${BACKEND_NAME}, softmax_axis)
{
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{1}), op::ParameterVector{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto a = backend->make_primary_tensor_view(element::f32, shape);
copy_data(a, vector<float>{-10, -20, -30, -40, -50, -60});
auto result = backend->make_primary_tensor_view(element::f32, shape);
auto d0 = expf(-10) + expf(-20) + expf(-30);
auto d1 = expf(-40) + expf(-50) + expf(-60);
cf->call({a}, {result});
vector<float> expected{expf(-10) / d0,
expf(-20) / d0,
expf(-30) / d0,
expf(-40) / d1,
expf(-50) / d1,
expf(-60) / d1};
EXPECT_TRUE(test::all_close(expected, read_vector<float>(result)));
}
TEST(${BACKEND_NAME}, softmax_underflow)
{
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
auto low = std::numeric_limits<float>::lowest();
auto a = backend->make_primary_tensor_view(element::f32, shape);
copy_data(a, vector<float>{low, 1, 2, 3, 4, 5});
auto result = backend->make_primary_tensor_view(element::f32, shape);
auto d0 = expf(low) + expf(3);
auto d1 = expf(1) + expf(4);
auto d2 = expf(2) + expf(5);
cf->call({a}, {result});
vector<float> expected{
expf(low) / d0, expf(1) / d1, expf(2) / d2, expf(3) / d0, expf(4) / d1, expf(5) / d2};
EXPECT_TRUE(test::all_close(expected, read_vector<float>(result)));
}