Unverified Commit 8808aacc authored by Yashas Samaga B L, committed by GitHub

Merge pull request #16658 from YashasSamaga:cuda4dnn-refactor-activations

cuda4dnn(activations, eltwise, scale_shift): refactor to reduce code duplication

* refactor activations

* refactor eltwise kernels

* move all functors to functors.hpp

* remove bias1 and scale1 kernels
parent 333a767b
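
The refactor replaces the per-activation and per-eltwise CUDA kernels with small device-side functors (collected in functors.hpp below) that are plugged into a shared elementwise kernel. A minimal sketch of that pattern, assuming simplified Span/View types and the csl grid_stride_range helper; the generic_op name and exact signature are illustrative, not taken from the diff:

// Sketch only: one generic kernel, parameterized by a functor, replaces many near-identical kernels.
namespace raw {
    template <class T, class Functor, class ...FunctorArgs>
    __global__ void generic_op(Span<T> output, View<T> input, FunctorArgs ...functorArgs) {
        Functor functor{functorArgs...}; // constructed on the device, where the __device__ constructors are valid
        auto output_ptr = output.data();
        auto input_ptr = input.data();
        for (auto i : grid_stride_range(output.size()))
            output_ptr[i] = functor(input_ptr[i]);
    }
}

Each activation then supplies only a functor (tanh_functor, relu_functor, ...) and, where needed, its parameters; the eltwise kernels follow the same pattern with a two-input functor such as sum_functor.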
@@ -63,17 +63,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, Args ...args) {
auto policy = make_policy(kernel);
-kernel <<<policy.grid, policy.block>>> (std::forward<Args>(args)...);
+kernel <<<policy.grid, policy.block>>> (args...);
}
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
-kernel <<<grid, block>>> (std::forward<Args>(args)...);
+kernel <<<grid, block>>> (args...);
}
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
-kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (std::forward<Args>(args)...);
+kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (args...);
}
}}}} /* namespace cv::dnn::cuda4dnn::csl */
......
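
In the hunk above, launch_kernel takes its argument pack by value, so std::forward had no effect; every argument is copied into the <<<...>>> launch regardless, and plain args... states that directly. A hedged usage sketch of the helpers (the fill kernel and the device_ptr/count names are illustrative, not from the diff):

// Sketch only: arguments reach the kernel by value through the launch_kernel helpers above.
template <class T>
__global__ void fill(T* data, std::size_t n, T value) {
    for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += std::size_t(gridDim.x) * blockDim.x)
        data[i] = value;
}

// launch_kernel(fill<float>, dim3(64), dim3(256), device_ptr, count, 0.0f);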
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
#define OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
#include <cuda_runtime.h>
#include "math.hpp"
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
struct abs_functor {
__device__ T operator()(T value) {
using csl::device::abs;
return abs(value);
}
};
template <class T>
struct tanh_functor {
__device__ T operator()(T value) {
using csl::device::tanh;
return tanh(value);
}
};
template <class T>
struct swish_functor {
__device__ T operator()(T value) {
using csl::device::sigmoid;
return value * sigmoid(value);
}
};
template <class T>
struct mish_functor {
__device__ T operator()(T value) {
using csl::device::tanh;
using csl::device::log1pexp;
return value * tanh(log1pexp(value));
}
};
template <class T>
struct sigmoid_functor {
__device__ T operator()(T value) {
using csl::device::sigmoid;
return sigmoid(value);
}
};
template <class T>
struct bnll_functor {
__device__ T operator()(T value) {
using csl::device::log1pexp;
return value > T(0) ? value + log1pexp(-value) : log1pexp(value);
}
};
template <class T>
struct elu_functor {
__device__ T operator()(T value) {
using csl::device::expm1;
return value >= T(0) ? value : expm1(value);
}
};
template <class T>
struct relu_functor {
__device__ relu_functor(T slope_) : slope{slope_} { }
__device__ T operator()(T value) {
using csl::device::log1pexp;
return value >= T(0) ? value : slope * value;
}
T slope;
};
template <class T>
struct clipped_relu_functor {
__device__ clipped_relu_functor(T floor_, T ceiling_) : floor{floor_}, ceiling{ceiling_} { }
__device__ T operator()(T value) {
using csl::device::clamp;
return clamp(value, floor, ceiling);
}
T floor, ceiling;
};
template <class T>
struct power_functor {
__device__ power_functor(T exp_, T scale_, T shift_) : exp{exp_}, scale{scale_}, shift{shift_} { }
__device__ T operator()(T value) {
using csl::device::pow;
return pow(shift + scale * value, exp);
}
T exp, scale, shift;
};
template <class T>
struct max_functor {
__device__ T operator()(T x, T y) {
using csl::device::max;
return max(x, y);
}
};
template <class T>
struct sum_functor {
__device__ T operator()(T x, T y) { return x + y; }
};
template <class T>
struct scaled_sum_functor {
__device__ scaled_sum_functor(T scale_x_, T scale_y_)
: scale_x{scale_x_}, scale_y{scale_y_} { }
__device__ T operator()(T x, T y) { return scale_x * x + scale_y * y; }
T scale_x, scale_y;
};
template <class T>
struct product_functor {
__device__ T operator()(T x, T y) { return x * y; }
};
template <class T>
struct div_functor {
__device__ T operator()(T x, T y) { return x / y; }
};
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */
\ No newline at end of file
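
On the host side, each operation then reduces to a thin launcher around the generic kernel. A hedged sketch under the same assumptions as before (the relu entry point below is illustrative; the real launchers additionally pick a vector width based on pointer alignment, as the scale/bias launchers later in this diff do):

// Sketch only: a complete ReLU entry point, given relu_functor above and the generic kernel sketched earlier.
template <class T>
void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
    CV_Assert(output.size() == input.size());
    auto kernel = raw::generic_op<T, relu_functor<T>, T>;
    auto policy = make_policy(kernel, output.size(), 0, stream);
    launch_kernel(kernel, policy, output, input, slope);
}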
@@ -24,22 +24,6 @@ using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void bias1_vec(Span<T> output, View<T> input, T beta) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] + beta;
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
using vector_type = get_vector_type_t<T, N>;
@@ -59,22 +43,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
}
}
template <class T, std::size_t N>
__global__ void scale1_vec(Span<T> output, View<T> input, T alpha) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] * alpha;
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
{
@@ -133,34 +101,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
}
}
template <class T, std::size_t N> static
void launch_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T beta) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::bias1_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, beta);
}
template <class T>
void bias1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T beta) {
CV_Assert(is_shape_same(input, output));
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_bias1_vec_kernel<T, 4>(stream, output, input, beta);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_bias1_vec_kernel<T, 2>(stream, output, input, beta);
} else {
launch_bias1_vec_kernel<T, 1>(stream, output, input, beta);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void bias1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
#endif
template void bias1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
template <class T, std::size_t N> static
void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
CV_Assert(is_fully_aligned<T>(output, N));
@@ -195,34 +135,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
#endif
template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
template <class T, std::size_t N> static
void launch_scale1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::scale1_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, alpha);
}
template <class T>
void scale1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T alpha) {
CV_Assert(is_shape_same(input, output));
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_scale1_vec_kernel<T, 4>(stream, output, input, alpha);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_scale1_vec_kernel<T, 2>(stream, output, input, alpha);
} else {
launch_scale1_vec_kernel<T, 1>(stream, output, input, alpha);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void scale1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
#endif
template void scale1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
template <class T, std::size_t N> static
void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
CV_Assert(is_fully_aligned<T>(output, N));
......
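
The deleted bias1 and scale1 kernels applied a single scalar across the whole tensor; the eltwise call site later in this diff switches to a combined scale1_with_bias1 helper instead. That helper is not shown here, but with the functor header in place it amounts to one more functor; a hedged sketch (the functor name and the alpha * x + beta semantics are assumptions):

// Sketch only: a combined scale-and-shift functor that subsumes both removed scalar kernels.
template <class T>
struct scale1_with_bias1_functor {
    __device__ scale1_with_bias1_functor(T alpha_, T beta_) : alpha{alpha_}, beta{beta_} { }
    __device__ T operator()(T value) { return alpha * value + beta; }
    T alpha, beta;
};

Run through the same generic elementwise kernel, alpha = 1 reproduces the old bias1 and beta = 0 reproduces the old scale1.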
@@ -19,7 +19,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T floor, T ceiling);
template <class T>
-void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp);
+void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp, T scale, T shift);
template <class T>
void biasN_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
......
@@ -12,18 +12,12 @@
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void bias1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);
template <class T>
void biasN(const csl::Stream& stream,
csl::TensorSpan<T> output,
csl::TensorView<T> input, std::size_t inner_size,
csl::TensorView<T> bias);
template <class T>
void scale1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);
template <class T>
void scaleN(const csl::Stream& stream,
csl::TensorSpan<T> output,
......
@@ -286,7 +286,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
-kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
+kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, T(1.0), T(0.0));
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
......
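
The widened biasN_power_inplace signature matches power_functor above, and this call site keeps the previous behaviour by passing scale = 1 and shift = 0. The assumed per-element effect, with the bias applied before the power functor and c the channel that element i belongs to:

// Assumed semantics of the fused bias + power step:
//   inplace_output[i] = pow(shift + scale * (inplace_output[i] + bias[c]), exp)
// With scale = 1 and shift = 0 this reduces to the earlier pow(inplace_output[i] + bias[c], exp).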
@@ -113,7 +113,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
*/
if (weight != 1.0)
{
-kernels::scale1<T>(stream, output, input, weight);
+kernels::scale1_with_bias1<T>(stream, output, input, weight, 1.0);
}
else if (!weightsTensor.empty())
{
......