Unverified Commit 8808aacc authored by Yashas Samaga B L, committed by GitHub

Merge pull request #16658 from YashasSamaga:cuda4dnn-refactor-activations

cuda4dnn(activations, eltwise, scale_shift): refactor to reduce code duplication

* refactor activations

* refactor eltwise kernels

* move all functors to functors.hpp

* remove bias1 and scale1 kernels
parent 333a767b
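
The refactor replaces the per-activation and per-eltwise CUDA kernels with small device-side functors (collected in functors.hpp below) that are plugged into a shared elementwise kernel. A minimal sketch of that pattern, assuming simplified Span/View types and the csl grid_stride_range helper; the generic_op name and exact signature are illustrative, not taken from the diff:

// Sketch only: one generic kernel, parameterized by a functor, replaces many near-identical kernels.
namespace raw {
    template <class T, class Functor, class ...FunctorArgs>
    __global__ void generic_op(Span<T> output, View<T> input, FunctorArgs ...functorArgs) {
        Functor functor{functorArgs...}; // constructed on the device, where the __device__ constructors are valid
        auto output_ptr = output.data();
        auto input_ptr = input.data();
        for (auto i : grid_stride_range(output.size()))
            output_ptr[i] = functor(input_ptr[i]);
    }
}

Each activation then supplies only a functor (tanh_functor, relu_functor, ...) and, where needed, its parameters; the eltwise kernels follow the same pattern with a two-input functor such as sum_functor.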
@@ -63,17 +63,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, Args ...args) {
auto policy = make_policy(kernel);
-kernel <<<policy.grid, policy.block>>> (std::forward<Args>(args)...);
+kernel <<<policy.grid, policy.block>>> (args...);
}
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
-kernel <<<grid, block>>> (std::forward<Args>(args)...);
+kernel <<<grid, block>>> (args...);
}
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
-kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (std::forward<Args>(args)...);
+kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (args...);
}
}}}} /* namespace cv::dnn::cuda4dnn::csl */
......
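
In the hunk above, launch_kernel takes its argument pack by value, so std::forward had no effect; every argument is copied into the <<<...>>> launch regardless, and plain args... states that directly. A hedged usage sketch of the helpers (the fill kernel and the device_ptr/count names are illustrative, not from the diff):

// Sketch only: arguments reach the kernel by value through the launch_kernel helpers above.
template <class T>
__global__ void fill(T* data, std::size_t n, T value) {
    for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += std::size_t(gridDim.x) * blockDim.x)
        data[i] = value;
}

// launch_kernel(fill<float>, dim3(64), dim3(256), device_ptr, count, 0.0f);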
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
#define OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
#include <cuda_runtime.h>
#include "math.hpp"
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
struct abs_functor {
__device__ T operator()(T value) {
using csl::device::abs;
return abs(value);
}
};
template <class T>
struct tanh_functor {
__device__ T operator()(T value) {
using csl::device::tanh;
return tanh(value);
}
};
template <class T>
struct swish_functor {
__device__ T operator()(T value) {
using csl::device::sigmoid;
return value * sigmoid(value);
}
};
template <class T>
struct mish_functor {
__device__ T operator()(T value) {
using csl::device::tanh;
using csl::device::log1pexp;
return value * tanh(log1pexp(value));
}
};
template <class T>
struct sigmoid_functor {
__device__ T operator()(T value) {
using csl::device::sigmoid;
return sigmoid(value);
}
};
template <class T>
struct bnll_functor {
__device__ T operator()(T value) {
using csl::device::log1pexp;
return value > T(0) ? value + log1pexp(-value) : log1pexp(value);
}
};
template <class T>
struct elu_functor {
__device__ T operator()(T value) {
using csl::device::expm1;
return value >= T(0) ? value : expm1(value);
}
};
template <class T>
struct relu_functor {
__device__ relu_functor(T slope_) : slope{slope_} { }
__device__ T operator()(T value) {
using csl::device::log1pexp;
return value >= T(0) ? value : slope * value;
}
T slope;
};
template <class T>
struct clipped_relu_functor {
__device__ clipped_relu_functor(T floor_, T ceiling_) : floor{floor_}, ceiling{ceiling_} { }
__device__ T operator()(T value) {
using csl::device::clamp;
return clamp(value, floor, ceiling);
}
T floor, ceiling;
};
template <class T>
struct power_functor {
__device__ power_functor(T exp_, T scale_, T shift_) : exp{exp_}, scale{scale_}, shift{shift_} { }
__device__ T operator()(T value) {
using csl::device::pow;
return pow(shift + scale * value, exp);
}
T exp, scale, shift;
};
template <class T>
struct max_functor {
__device__ T operator()(T x, T y) {
using csl::device::max;
return max(x, y);
}
};
template <class T>
struct sum_functor {
__device__ T operator()(T x, T y) { return x + y; }
};
template <class T>
struct scaled_sum_functor {
__device__ scaled_sum_functor(T scale_x_, T scale_y_)
: scale_x{scale_x_}, scale_y{scale_y_} { }
__device__ T operator()(T x, T y) { return scale_x * x + scale_y * y; }
T scale_x, scale_y;
};
template <class T>
struct product_functor {
__device__ T operator()(T x, T y) { return x * y; }
};
template <class T>
struct div_functor {
__device__ T operator()(T x, T y) { return x / y; }
};
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */
\ No newline at end of file
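
On the host side, each operation then reduces to a thin launcher around the generic kernel. A hedged sketch under the same assumptions as before (the relu entry point below is illustrative; the real launchers additionally pick a vector width based on pointer alignment, as the scale/bias launchers later in this diff do):

// Sketch only: a complete ReLU entry point, given relu_functor above and the generic kernel sketched earlier.
template <class T>
void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
    CV_Assert(output.size() == input.size());
    auto kernel = raw::generic_op<T, relu_functor<T>, T>;
    auto policy = make_policy(kernel, output.size(), 0, stream);
    launch_kernel(kernel, policy, output, input, slope);
}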
@@ -24,22 +24,6 @@ using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void bias1_vec(Span<T> output, View<T> input, T beta) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] + beta;
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
using vector_type = get_vector_type_t<T, N>;
@@ -59,22 +43,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
}
}
template <class T, std::size_t N>
__global__ void scale1_vec(Span<T> output, View<T> input, T alpha) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] * alpha;
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
{
@@ -133,34 +101,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
}
}
template <class T, std::size_t N> static
void launch_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T beta) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::bias1_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, beta);
}
template <class T>
void bias1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T beta) {
CV_Assert(is_shape_same(input, output));
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_bias1_vec_kernel<T, 4>(stream, output, input, beta);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_bias1_vec_kernel<T, 2>(stream, output, input, beta);
} else {
launch_bias1_vec_kernel<T, 1>(stream, output, input, beta);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void bias1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
#endif
template void bias1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
template <class T, std::size_t N> static
void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
CV_Assert(is_fully_aligned<T>(output, N));
@@ -195,34 +135,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
#endif
template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
template <class T, std::size_t N> static
void launch_scale1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::scale1_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, alpha);
}
template <class T>
void scale1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T alpha) {
CV_Assert(is_shape_same(input, output));
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_scale1_vec_kernel<T, 4>(stream, output, input, alpha);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_scale1_vec_kernel<T, 2>(stream, output, input, alpha);
} else {
launch_scale1_vec_kernel<T, 1>(stream, output, input, alpha);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void scale1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
#endif
template void scale1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
template <class T, std::size_t N> static
void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
CV_Assert(is_fully_aligned<T>(output, N));
......
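
The deleted bias1 and scale1 kernels applied a single scalar across the whole tensor; the eltwise call site later in this diff switches to a combined scale1_with_bias1 helper instead. That helper is not shown here, but with the functor header in place it amounts to one more functor; a hedged sketch (the functor name and the alpha * x + beta semantics are assumptions):

// Sketch only: a combined scale-and-shift functor that subsumes both removed scalar kernels.
template <class T>
struct scale1_with_bias1_functor {
    __device__ scale1_with_bias1_functor(T alpha_, T beta_) : alpha{alpha_}, beta{beta_} { }
    __device__ T operator()(T value) { return alpha * value + beta; }
    T alpha, beta;
};

Run through the same generic elementwise kernel, alpha = 1 reproduces the old bias1 and beta = 0 reproduces the old scale1.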
@@ -19,7 +19,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T floor, T ceiling);
template <class T>
-void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp);
+void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp, T scale, T shift);
template <class T>
void biasN_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
......
@@ -12,18 +12,12 @@
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void bias1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);
template <class T>
void biasN(const csl::Stream& stream,
csl::TensorSpan<T> output,
csl::TensorView<T> input, std::size_t inner_size,
csl::TensorView<T> bias);
template <class T>
void scale1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);
template <class T>
void scaleN(const csl::Stream& stream,
csl::TensorSpan<T> output,
......
@@ -286,7 +286,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
+kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, T(1.0), T(0.0));
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
......
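
The widened biasN_power_inplace signature matches power_functor above, and this call site keeps the previous behaviour by passing scale = 1 and shift = 0. The assumed per-element effect, with the bias applied before the power functor and c the channel that element i belongs to:

// Assumed semantics of the fused bias + power step:
//   inplace_output[i] = pow(shift + scale * (inplace_output[i] + bias[c]), exp)
// With scale = 1 and shift = 0 this reduces to the earlier pow(inplace_output[i] + bias[c], exp).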
@@ -113,7 +113,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
*/
if (weight != 1.0)
{
-kernels::scale1<T>(stream, output, input, weight);
+kernels::scale1_with_bias1<T>(stream, output, input, weight, 1.0);
}
else if (!weightsTensor.empty())
{
......