Commit 17c485eb authored by Yashas Samaga B L, committed by Alexander Alekhin

Merge pull request #16092 from YashasSamaga:cuda4dnn-conv-act-fuse

cuda4dnn: fuse activations with convolutions

* fuse ReLU, ReLU6, TanH, Sigmoid with conv

* fix OpenCL errors

* improve ReLU, add power, swish and mish

* fix missing fusion entries

* fix handling of unsetAttached

* remove whole file indentation

* optimize power = 1.0, use IDENTITY instead of NONE

* handle edge case: change backend and then clear
parent 5b0b59ec
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP
#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include <cstddef>
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void biasN_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T slope);
template <class T>
void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T floor, T ceiling);
template <class T>
void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp);
template <class T>
void biasN_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
template <class T>
void biasN_sigmoid_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
template <class T>
void biasN_swish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
template <class T>
void biasN_mish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP */
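The kernel bodies behind these declarations live in a separate .cu translation unit that is not shown in this excerpt; each one adds the per-channel bias and then applies the named activation in place (for reference, swish(x) = x * sigmoid(x) and mish(x) = x * tanh(ln(1 + exp(x)))). As a rough sketch of what the first declaration computes, a plain grid-stride kernel could look like the following; the raw-pointer signature, the kernel name and the unvectorized body are assumptions made for brevity, not OpenCV's actual csl-based implementation:

#include <cstddef>

// illustrative sketch only: add a per-channel bias and apply leaky ReLU in place
// (NCHW layout, so every `inner_size` consecutive elements share one bias value)
template <class T>
__global__ void biasN_relu_inplace_sketch(T* output, std::size_t size,
                                          const T* bias, std::size_t bias_size,
                                          std::size_t inner_size, T slope)
{
    for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
         i += std::size_t(gridDim.x) * blockDim.x)
    {
        const T sum = output[i] + bias[(i / inner_size) % bias_size];
        output[i] = sum >= T(0) ? sum : slope * sum;
    }
}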
@@ -12,6 +12,8 @@
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"
#include "../kernels/scale_shift.hpp"
#include "../kernels/activations.hpp"
#include "../kernels/bias_activation.hpp"
#include <opencv2/core.hpp>
@@ -44,6 +46,20 @@ namespace cv { namespace dnn { namespace cuda4dnn {
/* group count for grouped convolution */
std::size_t groups;
enum class ActivationType {
IDENTITY,
RELU, /* uses value provided in `relu_negative_slope` */
CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
POWER, /* scale and shift fused beforehand (fuseWeights); only `power_exp` is handled by CUDA */
TANH,
SIGMOID,
SWISH,
MISH
};
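/* parameters consumed by the fused kernels, depending on the activation selected above */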
ActivationType activation_type;
float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
};
template <class T>
@@ -59,7 +75,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
const auto& strides = config.strides;
const auto convolution_order = kernel_size.size();
CV_Assert(convolution_order > 1); /* was: CV_Assert(convolution_order >= 1); */
CV_Assert(convolution_order == dilations.size());
CV_Assert(convolution_order == strides.size());
@@ -72,7 +88,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
const auto groups = config.groups;
if (convolution_order > 3)
CV_Error(Error::StsNotImplemented, "Only 2D/3D convolution is supported."); /* was: "Only 1D/2D/3D convolution is supported." */
const auto rank = input_shape.size();
const auto output_feature_maps = output_shape[1];
@@ -190,6 +206,15 @@ namespace cv { namespace dnn { namespace cuda4dnn {
convoluter = csl::Convolution<T>(cudnnHandle, params);
activation = config.activation_type;
relu_negative_slope = config.relu_negative_slope;
crelu_floor = config.crelu_floor;
crelu_ceil = config.crelu_ceil;
power_exp = config.power_exp;
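/* a power of 1.0 is an identity mapping once scale/shift have been folded into the weights, so skip the extra kernel */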
if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f)
activation = ConvolutionConfiguration::ActivationType::IDENTITY;
csl::WorkspaceBuilder builder;
if (!transformed_shape.empty()) {
auto& shape = transformed_shape;
@@ -227,7 +252,62 @@ namespace cv { namespace dnn { namespace cuda4dnn {
if (!biasTensor.empty())
{
std::size_t inner_size = output.size_range(2, output.rank());
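/* bias present: add the per-channel bias and apply the fused activation in a single kernel */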
/* was a single unconditional call: kernels::biasN<T>(stream, output, output, inner_size, biasTensor); */
switch(activation)
{
case ConvolutionConfiguration::ActivationType::IDENTITY:
kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::RELU:
kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
break;
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::SIGMOID:
kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::SWISH:
kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
break;
case ConvolutionConfiguration::ActivationType::MISH:
kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
break;
}
}
else
{
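/* no bias: fall back to the standalone activation kernels from activations.hpp */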
switch(activation)
{
case ConvolutionConfiguration::ActivationType::IDENTITY:
break;
case ConvolutionConfiguration::ActivationType::RELU:
kernels::relu<T>(stream, output, output, relu_negative_slope);
break;
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
break;
case ConvolutionConfiguration::ActivationType::POWER:
kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
break;
case ConvolutionConfiguration::ActivationType::TANH:
kernels::tanh<T>(stream, output, output);
break;
case ConvolutionConfiguration::ActivationType::SIGMOID:
kernels::sigmoid<T>(stream, output, output);
break;
case ConvolutionConfiguration::ActivationType::SWISH:
kernels::swish<T>(stream, output, output);
break;
case ConvolutionConfiguration::ActivationType::MISH:
kernels::mish<T>(stream, output, output);
break;
}
}
}
@@ -243,6 +323,9 @@ namespace cv { namespace dnn { namespace cuda4dnn {
csl::TensorTransform<T> inputTransformer;
std::size_t scratch_mem_in_bytes;
ConvolutionConfiguration::ActivationType activation;
float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
};
}}} /* namespace cv::dnn::cuda4dnn */
...
@@ -2405,7 +2405,7 @@ struct Net::Impl
break;
}
if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA) /* was: if (preferableBackend != DNN_BACKEND_OPENCV) */
continue; // Go to the next layer.
// TODO: OpenCL target support more fusion styles.
@@ -2415,6 +2415,9 @@ struct Net::Impl
ld.layerInstance->type != "Concat")) )
continue;
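// the CUDA backend currently fuses activations into convolution layers only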
if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget) && ld.layerInstance->type != "Convolution")
continue;
while (nextData)
{
// For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
@@ -2426,6 +2429,16 @@ struct Net::Impl
nextData->type != "Power")
break;
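// for the CUDA target, only activations with fused bias+activation kernels are eligible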
if (IS_DNN_CUDA_TARGET(preferableTarget) &&
nextData->type != "ReLU" &&
nextData->type != "ReLU6" &&
nextData->type != "Power" &&
nextData->type != "TanH" &&
nextData->type != "Sigmoid" &&
nextData->type != "Swish" &&
nextData->type != "Mish")
break;
Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
if (nextActivLayer.empty())
break;
...
@@ -239,6 +239,12 @@ public:
ocl4dnnFusedActiv_t activType;
float power;
#endif
#ifdef HAVE_CUDA
cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp;
#endif
ConvolutionLayerImpl(const LayerParams &params) : BaseConvolutionLayerImpl(params)
{
#ifdef HAVE_OPENCL
@@ -246,6 +252,10 @@ public:
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
power = 0.f;
#endif
#ifdef HAVE_CUDA
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY;
#endif
}
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
@@ -406,6 +416,61 @@ public:
}
}
#endif
#ifdef HAVE_CUDA
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY;
if(IS_DNN_CUDA_TARGET(preferableTarget))
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
if(!activ_relu.empty())
{
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::RELU;
cuda_relu_slope = activ_relu->negativeSlope;
}
Ptr<ReLU6Layer> activ_relu6 = activ.dynamicCast<ReLU6Layer>();
if(!activ_relu6.empty())
{
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::CLIPPED_RELU;
cuda_crelu_floor = activ_relu6->minValue;
cuda_crelu_ceil = activ_relu6->maxValue;
}
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty())
{
if (activ_power->scale != 1.f || activ_power->shift != 0.f)
{
const int outCh = blobs[0].size[0];
fuseWeights(Mat(1, outCh, CV_32F, Scalar(activ_power->scale)),
Mat(1, outCh, CV_32F, Scalar(activ_power->shift)));
}
cuda_power_exp = activ_power->power;
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::POWER;
}
Ptr<TanHLayer> activ_tanh = activ.dynamicCast<TanHLayer>();
if(!activ_tanh.empty())
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::TANH;
Ptr<SigmoidLayer> activ_sigmoid = activ.dynamicCast<SigmoidLayer>();
if(!activ_sigmoid.empty())
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::SIGMOID;
Ptr<SwishLayer> activ_swish = activ.dynamicCast<SwishLayer>();
if(!activ_swish.empty())
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::SWISH;
Ptr<MishLayer> activ_mish = activ.dynamicCast<MishLayer>();
if(!activ_mish.empty())
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::MISH;
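/* nothing recognized: drop the activation so the fusion attempt is rejected and it runs as a separate layer */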
if (cudaActType == cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY)
activ.reset();
}
#endif
return !activ.empty();
}
@@ -1418,6 +1483,12 @@ public:
config.output_shape.assign(std::begin(output_shape), std::end(output_shape));
config.groups = groups;
config.activation_type = cudaActType;
config.relu_negative_slope = cuda_relu_slope;
config.crelu_floor = cuda_crelu_floor;
config.crelu_ceil = cuda_crelu_ceil;
config.power_exp = cuda_power_exp;
Mat filtersMat = fusedWeights ? weightsMat : blobs[0];
Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat();
if (countNonZero(biasMat) == 0)
...
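Taken together, the changes make the convolution layer hand its fused activation to the CUDA primitive through the configuration struct. The fragment below is an illustrative sketch, not code from the PR: only the field and enum names come from the diff above, and the slope value and surrounding setup are invented.

using cv::dnn::cuda4dnn::ConvolutionConfiguration;

ConvolutionConfiguration config;
/* ...kernel_size, strides, pads, input/output shapes and groups filled in as before... */
config.activation_type = ConvolutionConfiguration::ActivationType::RELU;
config.relu_negative_slope = 0.0f; /* plain ReLU; a leaky slope would be forwarded here */
/* the ConvolutionOp built from this config runs the cuDNN convolution and then one
   fused biasN_relu_inplace launch instead of separate bias and ReLU kernels */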