Commit e0e40405 authored by Vadim Pisarevsky

Merge pull request #9847 from wzw-intel:ocl4dnn_fusion

parents ff037ebe 2d8f2c2a
@@ -1028,7 +1028,7 @@ struct Net::Impl
     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
     {
-        if( !fusion || !(preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU))
+        if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT)
             return;
 
         CV_TRACE_FUNCTION();
@@ -1056,6 +1056,11 @@ struct Net::Impl
             // with the current layer if they follow it. Normally, the are fused with the convolution layer,
             // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
             // some other layers.
+
+            // TODO: OpenCL target support more fusion styles.
+            if ( preferableTarget == DNN_TARGET_OPENCL && ld.layerInstance->type.compare("Convolution") )
+                continue;
+
             Ptr<Layer>& currLayer = ld.layerInstance;
             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
             {
@@ -1100,16 +1105,27 @@ struct Net::Impl
                     }
                 }
 
-                Ptr<ActivationLayer> nextActivLayer;
-                if( nextData )
-                    nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
-
-                if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
-                        && currLayer->setActivation(nextActivLayer) )
+                // For now, OpenCL target only support fusion with activation of ReLU/ChannelsPReLU
+                if ( preferableTarget != DNN_TARGET_OPENCL ||
+                     (preferableTarget == DNN_TARGET_OPENCL &&
+                      nextData &&
+                      (!nextData->type.compare("ReLU") ||
+                       !nextData->type.compare("ChannelsPReLU"))) )
                 {
-                    printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
-                    nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
-                    ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+                    Ptr<ActivationLayer> nextActivLayer;
+
+                    if( nextData )
+                        nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+
+                    if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
+                        && currLayer->setActivation(nextActivLayer) )
+                    {
+                        LayerData *activData = nextData;
+                        printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
+                        activData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+                        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+                    }
                 }
             }
......
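Note on usage: with the change above, layer fusion runs for every DNN_BACKEND_DEFAULT target, and the OpenCL target additionally limits it to Convolution layers followed by ReLU/ChannelsPReLU. A minimal sketch of opting into that path from application code; the model/image file names, input size and mean values below are placeholders, only the backend/target calls matter:

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    using namespace cv;
    using namespace cv::dnn;

    // Placeholder model; any Caffe model with Convolution+ReLU pairs will do.
    Net net = readNetFromCaffe("deploy.prototxt", "weights.caffemodel");

    // Fusion is applied for the default backend; picking the OpenCL target
    // routes fused Convolution+ReLU/ChannelsPReLU through ocl4dnn.
    net.setPreferableBackend(DNN_BACKEND_DEFAULT);
    net.setPreferableTarget(DNN_TARGET_OPENCL);

    Mat img = imread("input.jpg");
    net.setInput(blobFromImage(img, 1.0, Size(224, 224), Scalar(104, 117, 123)));
    Mat out = net.forward();
    return 0;
}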
@@ -157,7 +157,20 @@ public:
 #ifdef HAVE_OPENCL
     Ptr<OCL4DNNConvSpatial<float> > convolutionOp;
     std::vector<UMat> umat_blobs;
+    bool fusedBias;
+    bool newWeightAndBias;
+    bool newActiv;
+    ocl4dnnFusedActiv_t activType;
 #endif
 
+    ConvolutionLayerImpl()
+    {
+#ifdef HAVE_OPENCL
+        fusedBias = false;
+        newWeightAndBias = false;
+        newActiv = false;
+        activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+#endif
+    }
+
     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
     {
@@ -209,6 +222,10 @@ public:
         activ = layer;
         if (activ.empty())
             reluslope.clear();
+#ifdef HAVE_OPENCL
+        newActiv = true;
+        activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+#endif
         return !activ.empty();
     }
@@ -221,6 +238,10 @@ public:
         // we will need to re-compute the weights with the batch
         // norm coefficients taken into account
         weightsMat.release();
+#ifdef HAVE_OPENCL
+        newWeightAndBias = true;
+        fusedBias = false;
+#endif
         return !bnorm.empty();
     }
@@ -230,6 +251,10 @@ public:
         // we will need to re-compute the weights with the scaling
         // coefficients taken into account
        weightsMat.release();
+#ifdef HAVE_OPENCL
+        newWeightAndBias = true;
+        fusedBias = false;
+#endif
         return !scaleLayer.empty();
     }
@@ -665,19 +690,49 @@ public:
             convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
         }
 
-        for (size_t ii = 0; ii < outputs.size(); ii++)
+        if ( newWeightAndBias )
         {
-            UMat inpMat, outMat;
-            inpMat = inputs[ii]->getUMat(ACCESS_READ);
-            outMat = outputs[ii].getUMat(ACCESS_WRITE);
-
-            int batch_size = inpMat.size[0];
-
-            if (!convolutionOp->Forward(inpMat, umat_blobs[0], hasBias() ? umat_blobs[1] : UMat(),
-                                        outMat, batch_size))
-                return false;
+            weightsMat.copyTo(umat_blobs[0]);
+            if ( fusedBias )
+            {
+                if ( umat_blobs.size() < 2 )
+                    umat_blobs.resize(2);
+                umat_blobs[1] = UMat(biasvec, true);
+            }
+            convolutionOp->setBias(fusedBias || hasBias());
+            newWeightAndBias = false;
         }
-        return true;
+
+        if ( newActiv )
+        {
+            if ( activType == OCL4DNN_CONV_FUSED_ACTIV_RELU )
+            {
+                CV_Assert(!reluslope.empty());
+                convolutionOp->setActivReLU(true, reluslope[0]);
+            }
+            else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_PRELU)
+            {
+                CV_Assert(!reluslope.empty());
+                convolutionOp->setActivPReLU(true, reluslope);
+            }
+            else
+            {
+                convolutionOp->setActivReLU(false, 0);
+                convolutionOp->setActivPReLU(false, reluslope);
+            }
+            newActiv = false;
+        }
+
+        UMat inpMat, outMat;
+        inpMat = inputs[0]->getUMat(ACCESS_READ);
+        outMat = outputs[0].getUMat(ACCESS_WRITE);
+
+        int batch_size = inpMat.size[0];
+
+        return convolutionOp->Forward(inpMat,
+                                      umat_blobs[0],
+                                      (hasBias() || fusedBias) ? umat_blobs[1] : UMat(),
+                                      outMat,
+                                      batch_size);
     }
 #endif
@@ -693,11 +748,6 @@ public:
         CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
         int ngroups = inputs[0]->size[1]/blobs[0].size[1];
         CV_Assert(outputs[0].size[1] % ngroups == 0);
-
-        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
-                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
-                   forward_ocl(inputs, outputs, internals))
-
         int k, outCn = blobs[0].size[0];
 
         if( weightsMat.empty() )
@@ -761,6 +811,11 @@ public:
                 }
             }
 
+#ifdef HAVE_OPENCL
+            if (shiftptr || shiftptr2)
+                fusedBias = true;
+#endif
             for( int i = 0; i < outCn; i++ )
             {
                 float s1 = scaleptr ? scaleptr[i] : 1.f;
@@ -784,7 +839,12 @@ public:
             {
                 Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
                 if( !activ_relu.empty() )
+                {
                     reluslope.assign(outCn+2, activ_relu->negativeSlope);
+#ifdef HAVE_OPENCL
+                    activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
+#endif
+                }
 
                 Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
                 if( !activ_chprelu.empty() )
@@ -795,9 +855,16 @@ public:
                     reluslope.resize(outCn+2);
                     std::copy(mdata, mdata + outCn, reluslope.begin());
                     reluslope[outCn] = reluslope[outCn+1] = reluslope[outCn-1];
+#ifdef HAVE_OPENCL
+                    activType = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
+#endif
                 }
             }
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs, outputs, internals))
+
         int nstripes = std::max(getNumThreads(), 1);
         ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,
......
@@ -73,6 +73,11 @@ struct OCL4DNNConvConfig
     bool bias_term; // = false;
 };
 
+typedef enum {
+    OCL4DNN_CONV_FUSED_ACTIV_NONE = 0,
+    OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,
+    OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
+} ocl4dnnFusedActiv_t;
+
 template<typename Dtype>
 class OCL4DNNConvSpatial
@@ -80,9 +85,13 @@ class OCL4DNNConvSpatial
 public:
     explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
     ~OCL4DNNConvSpatial();
-    bool Forward(const UMat& bottom_data, const UMat& weight,
+    bool Forward(const UMat& bottom_data,
+                 const UMat& weight,
                  const UMat& bias,
                  UMat& top_data, int32_t batch_size);
+    void setActivReLU(bool fuse_activ, float slope);
+    void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
+    void setBias(bool bias_term);
 
 private:
     struct kernelConfig
@@ -194,9 +203,9 @@ class OCL4DNNConvSpatial
                                   int32_t blockWidth,
                                   int32_t blockHeight,
                                   int32_t blockDepth);
-    bool setupIDLF(int32_t blockWidth,
-                   int32_t blockHeight,
-                   int32_t blockDepth);
+    bool createIDLFKernel(int32_t blockWidth,
+                          int32_t blockHeight,
+                          int32_t blockDepth);
     bool createBasicKernel(int32_t blockWidth,
                            int32_t blockHeight,
                            int32_t blockDepth);
@@ -244,10 +253,13 @@ class OCL4DNNConvSpatial
                      int lx, int ly, int lz,
                      bool swizzle, bool nullLocal);
     void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
+    void setFusionDefine(ocl4dnnFusedActiv_t fused_activ);
+    void setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx);
 
     int32_t group_;
     bool bias_term_;
     UMat swizzled_weights_umat;
+    UMat bottom_data2_;
 
     int32_t bottom_index_;
     int32_t output_h_;
@@ -291,6 +303,9 @@ class OCL4DNNConvSpatial
     std::stringstream options_;
     cv::ocl::ProgramSource src_;
     int32_t prev_kernel_type_;
+    bool negative_slope_;
+    UMat negative_slope_umat_;
+    ocl4dnnFusedActiv_t fused_activ_;
 };
 
 typedef enum {
......
@@ -78,6 +78,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     num_output_ = config.out_shape[dims - spatial_dims - 1];
     group_ = config.group;
 
+    fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+    negative_slope_ = 0;
     prev_kernel_type_ = -1;
     tuned_ = false;
@@ -138,6 +140,38 @@ OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
     }
 }
 
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
+{
+    switch (fused_activ) {
+        case OCL4DNN_CONV_FUSED_ACTIV_RELU:
+            addDef("FUSED_CONV_RELU", 1);
+            break;
+        case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
+            addDef("FUSED_CONV_PRELU", 1);
+            break;
+        default:
+            ;
+    }
+    return;
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx)
+{
+    switch (fused_activ) {
+        case OCL4DNN_CONV_FUSED_ACTIV_RELU:
+            kernel.set(argIdx++, (float)negative_slope_);
+            break;
+        case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
+            kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
+            break;
+        default:
+            ;
+    }
+    return;
+}
+
 template<typename Dtype>
 void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
 {
@@ -221,6 +255,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
         addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
         addDef("APPLY_BIAS", bias_term_);
+        setFusionDefine(fused_activ_);
 
         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
     }
@@ -242,6 +277,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         addDef("APPLY_BIAS", bias_term_);
         addDef("OUTPUT_Z", M_);
         addDef("ZPAR", 1);
+        setFusionDefine(fused_activ_);
 
         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
     }
@@ -278,6 +314,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         addDef("TILE_N_LAST", M_ % 32);
         addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
         addDef("APPLY_BIAS", bias_term_);
+        setFusionDefine(fused_activ_);
 
         src_ = ocl::dnn::conv_layer_spatial_oclsrc;
     }
 }
@@ -302,6 +339,37 @@ void OCL4DNNConvSpatial<Dtype>::setupKernel()
     setupKernelDetails(kernelType_, blockM_, blockK_, blockN_);
 }
 
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setBias(bool bias_term)
+{
+    bias_term_ = bias_term;
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setActivReLU(bool fuse_activ, float slope)
+{
+    if ( fuse_activ )
+    {
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU;
+        negative_slope_ = slope;
+    }
+    else
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float> &slope)
+{
+    if ( fuse_activ )
+    {
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
+        Mat tmpMat = Mat(num_output_, 1, CV_32FC1, (uchar*)&slope[0]);
+        tmpMat.copyTo(negative_slope_umat_);
+    }
+    else
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+}
+
 template<typename Dtype>
 bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
                                         const UMat& weight,
@@ -310,7 +378,6 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
                                         int32_t numImages)
 {
     num_ = numImages;
-
     prepareKernel(bottom, top, weight, bias, numImages);
     return convolve(bottom, top, weight, bias, numImages, bestKernelConfig, cv::ocl::Queue::getDefault());
 }
@@ -358,7 +425,9 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
                << "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_"
                << "p" << pad_w_ << "x" << pad_h_ << "_"
                << "num" << num_ << "_"
-               << "M" << M_;
+               << "M" << M_ << "_"
+               << "activ" << fused_activ_;
 
     key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
     key_sanitized_ = key_;
@@ -608,6 +677,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             return false;
 
         cl_uint argIdx = 0;
+        setFusionArg(fused_activ_, kernel, argIdx);
 
         UMat img_buffer;
         if (image_offset)
@@ -700,6 +770,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             return false;
 
         cl_uint argIdx = 0;
+        setFusionArg(fused_activ_, kernel, argIdx);
 
         UMat img_buffer;
         if (image_offset)
@@ -807,13 +878,16 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 int32_t output_image_offset = n * top_dim_
                     + output_w_ * output_h_ * M_ * g;
 
-                cl_uint argIdx = 0;
-                int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
+                int32_t kernel_offset = kernel_h_ * kernel_w_ *
+                                        (channels_ / group_) * M_
+                                        * g;
 
                 ocl::Kernel kernel(config->kernelName.c_str(), program);
                 if (kernel.empty())
                     return false;
 
+                cl_uint argIdx = 0;
+                setFusionArg(fused_activ_, kernel, argIdx);
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
                 kernel.set(argIdx++, image_offset);
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
@@ -1058,9 +1132,9 @@ bool OCL4DNNConvSpatial<float>::createGEMMLikeConvKernel(int32_t blockM,
 }
 
 template<>
-bool OCL4DNNConvSpatial<float>::setupIDLF(int32_t blockWidth,
-                                          int32_t blockHeight,
-                                          int32_t simd_size)
+bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
+                                                 int32_t blockHeight,
+                                                 int32_t simd_size)
 {
     int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
     const int32_t num_output_maps = M_;
@@ -1122,7 +1196,7 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
     src_ = ocl::ProgramSource();
 
     if (kernelType == KERNEL_TYPE_INTEL_IDLF)
-        return setupIDLF(blockWidth, blockHeight, blockDepth);
+        return createIDLFKernel(blockWidth, blockHeight, blockDepth);
     else if (kernelType == KERNEL_TYPE_BASIC)
         return createBasicKernel(blockWidth, blockHeight, blockDepth);
     else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
......
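A simple way to exercise the fused OpenCL path end to end is to run the same model on the CPU and OpenCL targets and compare the outputs. A rough sketch, assuming a Caffe model containing Convolution+ReLU pairs; the file names, input shape and the runOnce helper are made up for illustration:

#include <opencv2/dnn.hpp>
#include <iostream>

// Hypothetical helper: one forward pass of the same model on a given target.
static cv::Mat runOnce(int target, const cv::Mat& blob)
{
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    net.setPreferableTarget(target);
    net.setInput(blob);
    return net.forward().clone();
}

int main()
{
    // Synthetic input; the shape just has to match the model's data layer.
    int sz[] = {1, 3, 224, 224};
    cv::Mat blob(4, sz, CV_32F, cv::Scalar(0.5));

    cv::Mat ref = runOnce(cv::dnn::DNN_TARGET_CPU, blob);    // plain CPU path
    cv::Mat ocl = runOnce(cv::dnn::DNN_TARGET_OPENCL, blob); // fused ocl4dnn path
    std::cout << "max abs diff: " << cv::norm(ref, ocl, cv::NORM_INF) << std::endl;
    return 0;
}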