Commit e0e40405 authored by Vadim Pisarevsky

Merge pull request #9847 from wzw-intel:ocl4dnn_fusion

parents ff037ebe 2d8f2c2a
......@@ -1028,7 +1028,7 @@ struct Net::Impl
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
if( !fusion || !(preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU))
if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT)
return;
CV_TRACE_FUNCTION();
......@@ -1056,6 +1056,11 @@ struct Net::Impl
// with the current layer if they follow it. Normally, they are fused with the convolution layer,
// but some of them (like activation) may be fused with fully-connected, elemwise (+) and
// some other layers.
// TODO: support more fusion styles for the OpenCL target.
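// Note: std::string::compare() returns non-zero when the types differ, so on the
// OpenCL target every non-Convolution layer is skipped here.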
if ( preferableTarget == DNN_TARGET_OPENCL && ld.layerInstance->type.compare("Convolution") )
continue;
Ptr<Layer>& currLayer = ld.layerInstance;
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
{
......@@ -1100,16 +1105,27 @@ struct Net::Impl
}
}
Ptr<ActivationLayer> nextActivLayer;
if( nextData )
nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
&& currLayer->setActivation(nextActivLayer) )
// For now, the OpenCL target only supports fusion with a ReLU/ChannelsPReLU activation.
if ( preferableTarget != DNN_TARGET_OPENCL ||
(nextData &&
(!nextData->type.compare("ReLU") ||
!nextData->type.compare("ChannelsPReLU"))) )
{
printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
Ptr<ActivationLayer> nextActivLayer;
if( nextData )
nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
&& currLayer->setActivation(nextActivLayer) )
{
LayerData *activData = nextData;
printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
activData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
}
}
}
......
......@@ -157,7 +157,20 @@ public:
#ifdef HAVE_OPENCL
Ptr<OCL4DNNConvSpatial<float> > convolutionOp;
std::vector<UMat> umat_blobs;
bool fusedBias;
bool newWeightAndBias;
bool newActiv;
ocl4dnnFusedActiv_t activType;
#endif
ConvolutionLayerImpl()
{
#ifdef HAVE_OPENCL
fusedBias = false;
newWeightAndBias = false;
newActiv = false;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
#endif
}
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
......@@ -209,6 +222,10 @@ public:
activ = layer;
if (activ.empty())
reluslope.clear();
#ifdef HAVE_OPENCL
newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
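// The concrete fused type (ReLU or PReLU) is filled in later, when forward()
// extracts the slopes from the attached activation layer.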
#endif
return !activ.empty();
}
......@@ -221,6 +238,10 @@ public:
// we will need to re-compute the weights with the batch
// norm coefficients taken into account
weightsMat.release();
#ifdef HAVE_OPENCL
newWeightAndBias = true;
fusedBias = false;
#endif
return !bnorm.empty();
}
......@@ -230,6 +251,10 @@ public:
// we will need to re-compute the weights with the scaling
// coefficients taken into account
weightsMat.release();
#ifdef HAVE_OPENCL
newWeightAndBias = true;
fusedBias = false;
#endif
return !scaleLayer.empty();
}
......@@ -665,19 +690,49 @@ public:
convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
}
for (size_t ii = 0; ii < outputs.size(); ii++)
if ( newWeightAndBias )
{
UMat inpMat, outMat;
inpMat = inputs[ii]->getUMat(ACCESS_READ);
outMat = outputs[ii].getUMat(ACCESS_WRITE);
int batch_size = inpMat.size[0];
weightsMat.copyTo(umat_blobs[0]);
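// A bias folded in by batch-norm/scale fusion must also be (re)uploaded as the second device blob.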
if ( fusedBias )
{
if ( umat_blobs.size() < 2 )
umat_blobs.resize(2);
umat_blobs[1] = UMat(biasvec, true);
}
convolutionOp->setBias(fusedBias || hasBias());
newWeightAndBias = false;
}
if (!convolutionOp->Forward(inpMat, umat_blobs[0], hasBias() ? umat_blobs[1] : UMat(),
outMat, batch_size))
return false;
if ( newActiv )
{
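// Hand the fused activation parameters (ReLU slope or per-channel PReLU slopes)
// over to the OCL4DNN convolution.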
if ( activType == OCL4DNN_CONV_FUSED_ACTIV_RELU )
{
CV_Assert(!reluslope.empty());
convolutionOp->setActivReLU(true, reluslope[0]);
}
else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_PRELU)
{
CV_Assert(!reluslope.empty());
convolutionOp->setActivPReLU(true, reluslope);
}
else
{
convolutionOp->setActivReLU(false, 0);
convolutionOp->setActivPReLU(false, reluslope);
}
newActiv = false;
}
return true;
UMat inpMat, outMat;
inpMat = inputs[0]->getUMat(ACCESS_READ);
outMat = outputs[0].getUMat(ACCESS_WRITE);
int batch_size = inpMat.size[0];
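// The bias blob is passed whenever the layer has its own bias or one was folded in by fusion.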
return convolutionOp->Forward(inpMat,
umat_blobs[0],
(hasBias() || fusedBias) ? umat_blobs[1] : UMat(),
outMat,
batch_size);
}
#endif
......@@ -693,11 +748,6 @@ public:
CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
int ngroups = inputs[0]->size[1]/blobs[0].size[1];
CV_Assert(outputs[0].size[1] % ngroups == 0);
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs, outputs, internals))
int k, outCn = blobs[0].size[0];
if( weightsMat.empty() )
......@@ -761,6 +811,11 @@ public:
}
}
#ifdef HAVE_OPENCL
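// Any non-zero shift contributed by a fused layer means the OpenCL path needs a bias blob.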
if (shiftptr || shiftptr2)
fusedBias = true;
#endif
for( int i = 0; i < outCn; i++ )
{
float s1 = scaleptr ? scaleptr[i] : 1.f;
......@@ -784,7 +839,12 @@ public:
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
if( !activ_relu.empty() )
{
reluslope.assign(outCn+2, activ_relu->negativeSlope);
#ifdef HAVE_OPENCL
activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
#endif
}
Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
if( !activ_chprelu.empty() )
......@@ -795,9 +855,16 @@ public:
reluslope.resize(outCn+2);
std::copy(mdata, mdata + outCn, reluslope.begin());
reluslope[outCn] = reluslope[outCn+1] = reluslope[outCn-1];
#ifdef HAVE_OPENCL
activType = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
#endif
}
}
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs, outputs, internals))
int nstripes = std::max(getNumThreads(), 1);
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,
......
......@@ -73,6 +73,11 @@ struct OCL4DNNConvConfig
bool bias_term; // = false;
};
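// Activation kinds that can be fused into the OCL4DNN convolution kernels.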
typedef enum {
OCL4DNN_CONV_FUSED_ACTIV_NONE = 0,
OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,
OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
} ocl4dnnFusedActiv_t;
template<typename Dtype>
class OCL4DNNConvSpatial
......@@ -80,9 +85,13 @@ class OCL4DNNConvSpatial
public:
explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
~OCL4DNNConvSpatial();
bool Forward(const UMat& bottom_data, const UMat& weight,
bool Forward(const UMat& bottom_data,
const UMat& weight,
const UMat& bias,
UMat& top_data, int32_t batch_size);
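// Fusion hooks: the DNN convolution layer calls these after layer fusion to fold
// an activation and/or an extra bias into the generated kernels.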
void setActivReLU(bool fuse_activ, float slope);
void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
void setBias(bool bias_term);
private:
struct kernelConfig
......@@ -194,9 +203,9 @@ class OCL4DNNConvSpatial
int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
bool setupIDLF(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
bool createIDLFKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
bool createBasicKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
......@@ -244,10 +253,13 @@ class OCL4DNNConvSpatial
int lx, int ly, int lz,
bool swizzle, bool nullLocal);
void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx);
int32_t group_;
bool bias_term_;
UMat swizzled_weights_umat;
UMat bottom_data2_;
int32_t bottom_index_;
int32_t output_h_;
......@@ -291,6 +303,9 @@ class OCL4DNNConvSpatial
std::stringstream options_;
cv::ocl::ProgramSource src_;
int32_t prev_kernel_type_;
float negative_slope_;
UMat negative_slope_umat_;
ocl4dnnFusedActiv_t fused_activ_;
};
typedef enum {
......
......@@ -78,6 +78,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
num_output_ = config.out_shape[dims - spatial_dims - 1];
group_ = config.group;
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
negative_slope_ = 0;
prev_kernel_type_ = -1;
tuned_ = false;
......@@ -138,6 +140,38 @@ OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
}
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
{
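// Emit a build-time define so the OpenCL convolution source compiles the matching
// fused-activation path.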
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
addDef("FUSED_CONV_RELU", 1);
break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
addDef("FUSED_CONV_PRELU", 1);
break;
default:
;
}
return;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx)
{
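// Fused kernels expect the activation parameter first: a scalar negative slope for
// ReLU, a per-channel slope buffer for PReLU.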
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_);
break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
break;
default:
;
}
return;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
{
......@@ -221,6 +255,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
addDef("APPLY_BIAS", bias_term_);
setFusionDefine(fused_activ_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
......@@ -242,6 +277,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", M_);
addDef("ZPAR", 1);
setFusionDefine(fused_activ_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
......@@ -278,6 +314,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("TILE_N_LAST", M_ % 32);
addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
addDef("APPLY_BIAS", bias_term_);
setFusionDefine(fused_activ_);
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
}
}
......@@ -302,6 +339,37 @@ void OCL4DNNConvSpatial<Dtype>::setupKernel()
setupKernelDetails(kernelType_, blockM_, blockK_, blockN_);
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setBias(bool bias_term)
{
bias_term_ = bias_term;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivReLU(bool fuse_activ, float slope)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU;
negative_slope_ = slope;
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float> &slope)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
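// Wrap the per-channel slopes in a Mat header and upload them to device memory
// for the PReLU kernel argument.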
Mat tmpMat = Mat(num_output_, 1, CV_32FC1, (uchar*)&slope[0]);
tmpMat.copyTo(negative_slope_umat_);
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
const UMat& weight,
......@@ -310,7 +378,6 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
int32_t numImages)
{
num_ = numImages;
prepareKernel(bottom, top, weight, bias, numImages);
return convolve(bottom, top, weight, bias, numImages, bestKernelConfig, cv::ocl::Queue::getDefault());
}
......@@ -358,7 +425,9 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
<< "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_"
<< "p" << pad_w_ << "x" << pad_h_ << "_"
<< "num" << num_ << "_"
<< "M" << M_;
<< "M" << M_ << "_"
<< "activ" << fused_activ_;
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
key_sanitized_ = key_;
......@@ -608,6 +677,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
......@@ -700,6 +770,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
......@@ -807,13 +878,16 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
int32_t output_image_offset = n * top_dim_
+ output_w_ * output_h_ * M_ * g;
cl_uint argIdx = 0;
int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
int32_t kernel_offset = kernel_h_ * kernel_w_ *
(channels_ / group_) * M_ * g;
ocl::Kernel kernel(config->kernelName.c_str(), program);
if (kernel.empty())
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
......@@ -1058,9 +1132,9 @@ bool OCL4DNNConvSpatial<float>::createGEMMLikeConvKernel(int32_t blockM,
}
template<>
bool OCL4DNNConvSpatial<float>::setupIDLF(int32_t blockWidth,
int32_t blockHeight,
int32_t simd_size)
bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t simd_size)
{
int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
const int32_t num_output_maps = M_;
......@@ -1122,7 +1196,7 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
src_ = ocl::ProgramSource();
if (kernelType == KERNEL_TYPE_INTEL_IDLF)
return setupIDLF(blockWidth, blockHeight, blockDepth);
return createIDLFKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_BASIC)
return createBasicKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
......