Commit 438e456c authored by Alexander Alekhin

Merge pull request #10113 from wzw-intel:fusion

parents 75b980ac 45d11dde
@@ -1233,12 +1233,13 @@ struct Net::Impl
}
}
-// For now, OpenCL target only support fusion with activation of ReLU/ChannelsPReLU
+// For now, OpenCL target only support fusion with activation of ReLU/ChannelsPReLU/Power
if ( preferableTarget != DNN_TARGET_OPENCL ||
(preferableTarget == DNN_TARGET_OPENCL &&
nextData &&
(!nextData->type.compare("ReLU") ||
-!nextData->type.compare("ChannelsPReLU"))) )
+!nextData->type.compare("ChannelsPReLU") ||
+!nextData->type.compare("Power"))) )
{
Ptr<ActivationLayer> nextActivLayer;
@@ -1253,6 +1254,78 @@ struct Net::Impl
printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
activData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
if ( preferableTarget == DNN_TARGET_OPENCL )
{
nextData = &layers[activData->consumers[0].lid];
lpNext = LayerPin(activData->consumers[0].lid, 0);
}
}
}
// fuse convolution layer followed by eltwise + relu
if ( preferableTarget == DNN_TARGET_OPENCL )
{
Ptr<EltwiseLayer> nextEltwiseLayer;
if( nextData )
nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 )
{
LayerData *eltwiseData = nextData;
// go down from the second input and find the first non-skipped layer.
LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[1].lid];
while (downLayerData->skipFlags[DNN_BACKEND_DEFAULT])
{
downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
}
// second input layer is current layer.
if ( ld.id == downLayerData->id )
{
// go down from the first input and find the first non-skipped layer
downLayerData = &layers[eltwiseData->inputBlobsId[0].lid];
while (downLayerData->skipFlags[DNN_BACKEND_DEFAULT])
{
if ( !downLayerData->type.compare("Eltwise") )
downLayerData = &layers[downLayerData->inputBlobsId[1].lid];
else
downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
}
Ptr<ConvolutionLayer> convLayer;
if( downLayerData )
convLayer = downLayerData->layerInstance.dynamicCast<ConvolutionLayer>();
// first input layer is convolution layer
if( !convLayer.empty() )
{
// fuse eltwise + activation layer
LayerData *firstConvLayerData = downLayerData;
{
nextData = &layers[eltwiseData->consumers[0].lid];
lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
Ptr<ActivationLayer> nextActivLayer;
if( nextData )
nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
(!nextData->type.compare("ReLU") ||
!nextData->type.compare("ChannelsPReLU") ||
!nextData->type.compare("Power")) &&
currLayer->setActivation(nextActivLayer) )
{
CV_Assert(firstConvLayerData->outputBlobs.size() == 1 && ld.inputBlobs.size() == 1);
ld.inputBlobs.push_back(&firstConvLayerData->outputBlobs[0]);
printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
eltwiseData->skipFlags[DNN_BACKEND_DEFAULT] = true;
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
}
}
}
}
}
}
}
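The block above rewires the graph for the common residual-block tail: a convolution whose output is summed (Eltwise) with another convolution's output and then passed through ReLU/ChannelsPReLU/Power. A minimal host-side sketch of what the fused kernel ends up computing per element (names are illustrative, not from the patch):

```cpp
#include <algorithm>
#include <vector>

// convOut: the current convolution's result; branchOut: the other Eltwise input
// (firstConvLayerData's output blob, passed to the kernel as bottom_data2).
std::vector<float> fusedEltwiseRelu(const std::vector<float>& convOut,
                                    const std::vector<float>& branchOut)
{
    std::vector<float> top(convOut.size());
    for (size_t i = 0; i < convOut.size(); ++i)
        top[i] = std::max(0.f, convOut[i] + branchOut[i]); // Eltwise sum, then ReLU
    return top;
}
```

Once this pattern matches, the Eltwise and activation layers are marked as skipped and the convolution takes over their output blobs.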
...
@@ -142,6 +142,9 @@ public:
}
};
#define IS_POWER_LAYER(layer) \
(!layer.empty() && !layer->type.compare("Power"))
//TODO: simultaneously convolution and bias addition for cache optimization
class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
@@ -161,6 +164,7 @@ public:
bool newWeightAndBias;
bool newActiv;
ocl4dnnFusedActiv_t activType;
float power;
#endif
ConvolutionLayerImpl()
{
@@ -169,6 +173,7 @@ public:
newWeightAndBias = false;
newActiv = false;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
power = 0.f;
#endif
}
@@ -225,6 +230,22 @@ public:
#ifdef HAVE_OPENCL
newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
if (preferableTarget == DNN_TARGET_OPENCL)
{
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty())
{
if (activ_power->scale != 1.f || activ_power->shift != 0.f)
newWeightAndBias = true;
if (activ_power->scale != 1.f)
weightsMat.release();
power = activ_power->power;
activType = OCL4DNN_CONV_FUSED_ACTIV_POWER;
}
}
#endif
return !activ.empty();
}
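For the fused Power activation, recall that PowerLayer computes pow(scale*x + shift, power). The linear scale/shift part is folded into the convolution's weights and bias (which is why newWeightAndBias is set and weightsMat is released above), so the OpenCL kernel only has to apply the exponent. A hedged one-line restatement:

```cpp
#include <cmath>

// After folding, convOut already carries the Power layer's scale/shift,
// so the fused activation reduces to pow().
inline float fusedPowerActivation(float convOut, float power)
{
    return std::pow(convOut, power);
}
```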
@@ -727,11 +748,12 @@ public:
biasvec[k] = biasMat.at<float>(k);
}
-if( !bnorm.empty() || !scaleLayer.empty() )
+if( !bnorm.empty() || !scaleLayer.empty() || IS_POWER_LAYER(activ))
{
Mat scale, shift, scale2, shift2;
const float *scaleptr = 0, *shiftptr = 0;
const float *scaleptr2 = 0, *shiftptr2 = 0;
float a = 1.f, b = 0.f;
if( !bnorm.empty() )
{
@@ -758,7 +780,14 @@ public:
}
}
-if (shiftptr || shiftptr2)
+if( IS_POWER_LAYER(activ) )
{
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
a = activ_power->scale;
b = activ_power->shift;
}
if (shiftptr || shiftptr2 || b != 0.f)
fusedBias = true;
for( int i = 0; i < outCn; i++ )
@@ -771,9 +800,9 @@ public:
int j, wcols = weightsMat.cols;
for( j = 0; j < wcols; j++ )
-w_i[j] *= (s1*s2);
-biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
+w_i[j] *= (s1*s2*a);
+biasvec[i] = biasvec[i]*(s1*s2*a) + (delta1*s2*a + delta2*a + b);
}
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
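The new factors a and b extend the existing batch-norm/scale folding to the Power layer's linear part. A small self-contained sanity check of the algebra used above (toy values, not from the patch):

```cpp
#include <cstdio>

int main()
{
    float W = 2.f, x = 3.f, c = 0.5f;   // toy conv weight, input, conv bias
    float s1 = 1.5f, d1 = 0.1f;         // batch norm scale/shift
    float s2 = 0.8f, d2 = -0.2f;        // Scale layer scale/shift
    float a  = 2.f,  b  = 0.3f;         // Power layer scale/shift (its linear part)

    // Layer chain applied to the raw convolution W*x + c:
    float ref = a * (s2 * (s1 * (W * x + c) + d1) + d2) + b;

    // Folded form written into weightsMat/biasvec above:
    float fusedW = W * (s1 * s2 * a);
    float fusedB = c * (s1 * s2 * a) + (d1 * s2 * a + d2 * a + b);

    std::printf("reference %.4f, fused %.4f\n", ref, fusedW * x + fusedB); // identical
    return 0;
}
```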
@@ -827,10 +856,15 @@ public:
CV_Assert(!reluslope.empty());
convolutionOp->setActivPReLU(true, reluslope);
}
else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_POWER)
{
convolutionOp->setActivPower(true, power);
}
else
{
convolutionOp->setActivReLU(false, 0);
convolutionOp->setActivPReLU(false, reluslope);
convolutionOp->setActivPower(false, 1.f);
}
newActiv = false;
}
@@ -840,6 +874,7 @@ public:
int batch_size = inpMat.size[0];
return convolutionOp->Forward(inpMat,
inputs.size() == 2 ? inputs[1] : UMat(),
umat_blobs[0],
(hasBias() || fusedBias) ? umat_blobs[1] : UMat(),
outMat,
...
@@ -77,6 +77,7 @@ typedef enum {
OCL4DNN_CONV_FUSED_ACTIV_NONE = 0,
OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,
OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
OCL4DNN_CONV_FUSED_ACTIV_POWER = 3
} ocl4dnnFusedActiv_t;
template<typename Dtype>
@@ -86,11 +87,13 @@ class OCL4DNNConvSpatial
explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
~OCL4DNNConvSpatial();
bool Forward(const UMat& bottom_data,
const UMat& bottom_data2,
const UMat& weight,
const UMat& bias,
UMat& top_data, int32_t batch_size);
void setActivReLU(bool fuse_activ, float slope);
void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
void setActivPower(bool fuse_activ, float power);
void setBias(bool bias_term);
private:
@@ -252,8 +255,8 @@ class OCL4DNNConvSpatial
int lx, int ly, int lz,
bool swizzle, bool nullLocal);
void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
-void setFusionDefine(ocl4dnnFusedActiv_t fused_activ);
-void setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx);
+void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
+void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
int32_t group_;
bool bias_term_;
@@ -305,6 +308,8 @@ class OCL4DNNConvSpatial
float negative_slope_;
UMat negative_slope_umat_;
ocl4dnnFusedActiv_t fused_activ_;
float power_;
bool fused_eltwise_;
};
typedef enum {
...
@@ -79,6 +79,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
group_ = config.group;
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
fused_eltwise_ = false;
power_ = 1.f;
negative_slope_ = 0;
prev_kernel_type_ = -1;
tuned_ = false;
@@ -141,8 +143,11 @@ OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
}
template<typename Dtype>
-void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
+void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise)
{
if (fused_eltwise)
addDef("FUSED_CONV_ELTWISE", 1);
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
addDef("FUSED_CONV_RELU", 1);
@@ -150,6 +155,9 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
addDef("FUSED_CONV_PRELU", 1);
break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
addDef("FUSED_CONV_POWER", 1);
break;
default:
;
}
@@ -157,8 +165,11 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
}
template<typename Dtype>
-void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx)
+void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
{
if (fused_eltwise)
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_);
@@ -166,6 +177,9 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, oc
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
kernel.set(argIdx++, (float)power_);
break;
default:
;
}
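setFusionDefine() and setFusionArg() have to stay in lockstep: the macros baked into the program decide which extra arguments the kernels expect, and with eltwise fusion the second input buffer is passed first. A simplified sketch of the build options this amounts to (the real code goes through addDef(), not a raw option string):

```cpp
#include <string>

// fusedActiv follows ocl4dnnFusedActiv_t: 0 none, 1 ReLU, 2 PReLU, 3 Power.
std::string fusionBuildOptions(bool fusedEltwise, int fusedActiv)
{
    std::string opts;
    if (fusedEltwise)    opts += " -D FUSED_CONV_ELTWISE=1"; // kernel then takes eltwise_data first
    if (fusedActiv == 1) opts += " -D FUSED_CONV_RELU=1";    // plus a float negative_slope arg
    if (fusedActiv == 2) opts += " -D FUSED_CONV_PRELU=1";   // plus a per-channel slope buffer
    if (fusedActiv == 3) opts += " -D FUSED_CONV_POWER=1";   // plus a float power arg
    return opts;
}
```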
@@ -255,7 +269,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
addDef("APPLY_BIAS", bias_term_);
-setFusionDefine(fused_activ_);
+setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
@@ -277,7 +291,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", M_);
addDef("ZPAR", 1);
-setFusionDefine(fused_activ_);
+setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
@@ -314,7 +328,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("TILE_N_LAST", M_ % 32);
addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
addDef("APPLY_BIAS", bias_term_);
-setFusionDefine(fused_activ_);
+setFusionDefine(fused_activ_, fused_eltwise_);
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
}
}
@@ -370,14 +384,37 @@ void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivPower(bool fuse_activ, float power)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_POWER;
power_ = power;
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
const UMat& bottom2,
const UMat& weight,
const UMat& bias,
UMat& top,
int32_t numImages)
{
num_ = numImages;
if (!bottom2.empty())
{
fused_eltwise_ = true;
bottom_data2_ = bottom2;
}
else
{
fused_eltwise_ = false;
}
prepareKernel(bottom, top, weight, bias, numImages);
if (bestKernelConfig.empty())
return false;
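Because a kernel tuned with the extra eltwise input cannot be reused for the plain convolution, the fusion state also has to appear in the auto-tuning cache key, which is what the generateKey() change below does. A minimal illustration of the resulting key suffix (assumed format, mirroring the stream inserts):

```cpp
#include <sstream>
#include <string>

std::string keySuffix(int fusedActiv, bool fusedEltwise)
{
    std::ostringstream k;
    k << "activ" << fusedActiv << "_" << "eltwise" << fusedEltwise;
    return k.str(); // e.g. "activ3_eltwise1"
}
```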
@@ -428,7 +465,8 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
<< "p" << pad_w_ << "x" << pad_h_ << "_"
<< "num" << num_ << "_"
<< "M" << M_ << "_"
-<< "activ" << fused_activ_;
+<< "activ" << fused_activ_ << "_"
<< "eltwise" << fused_eltwise_;
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
@@ -678,7 +716,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
-setFusionArg(fused_activ_, kernel, argIdx);
+setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
@@ -771,7 +809,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
-setFusionArg(fused_activ_, kernel, argIdx);
+setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
@@ -888,7 +926,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
-setFusionArg(fused_activ_, kernel, argIdx);
+setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
@@ -1491,6 +1529,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
if (loadCachedConfig()) // check in-memory cache
return;
if (loadTunedConfig()) // check external storage
return;
...
@@ -52,12 +52,21 @@
#elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c])))
#define NEGATIVE_SLOPE_ARG __global const Dtype *negative_slope,
#elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power)
#define NEGATIVE_SLOPE_ARG Dtype power,
#else
#define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define NEGATIVE_SLOPE_ARG
#endif
#ifdef FUSED_CONV_ELTWISE
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(eltwise_data[(_offset_)] + (_data_), _channel_);} while(0)
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
#else
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_data_, _channel_);} while(0)
#define ELTWISE_DATA_ARG
#endif
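With FUSED_CONV_ELTWISE and an activation macro both defined, ACTIVATION_FUNCTION adds the second bottom blob to the freshly computed convolution value before applying the activation. A host-side restatement of the per-element semantics for the eltwise + Power case (illustrative only):

```cpp
#include <cmath>

// dst[offset] = pow(eltwise_data[offset] + convValue, power)
inline void storeFusedOutput(float* dst, int offset, float convValue,
                             const float* eltwiseData, float power)
{
    dst[offset] = std::pow(eltwiseData[offset] + convValue, power);
}
```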
#define __CAT(x, y) x##y
@@ -99,6 +108,7 @@
#ifdef KERNEL_BASIC
__kernel void ConvolveBasic(
ELTWISE_DATA_ARG
NEGATIVE_SLOPE_ARG
__global Dtype* image_data,
int image_offset,
@@ -193,6 +203,7 @@ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
#endif
__kernel void
convolve_simd(
ELTWISE_DATA_ARG
NEGATIVE_SLOPE_ARG
__global Dtype* inputs_base,
filter_qualifier Dtype* weights_base,
@@ -413,6 +424,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
#define ROW_PITCH input_width
#define GEMM_LIKE_KERNEL_ARGS \
ELTWISE_DATA_ARG \
NEGATIVE_SLOPE_ARG \
const __global Dtype *src0, \
const __global Dtype *src1, \
...