Commit e0e40405 authored by Vadim Pisarevsky

Merge pull request #9847 from wzw-intel:ocl4dnn_fusion

parents ff037ebe 2d8f2c2a
......@@ -1028,7 +1028,7 @@ struct Net::Impl
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
if( !fusion || !(preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU))
if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT)
return;
CV_TRACE_FUNCTION();
......@@ -1056,6 +1056,11 @@ struct Net::Impl
// with the current layer if they follow it. Normally, they are fused with the convolution layer,
// but some of them (like activation) may be fused with fully-connected, elemwise (+) and
// some other layers.
// TODO: support more fusion styles for the OpenCL target.
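// Note: std::string::compare() returns non-zero when the types differ, so on the
// OpenCL target every non-Convolution layer is skipped here.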
if ( preferableTarget == DNN_TARGET_OPENCL && ld.layerInstance->type.compare("Convolution") )
continue;
Ptr<Layer>& currLayer = ld.layerInstance;
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
{
......@@ -1100,16 +1105,27 @@ struct Net::Impl
}
}
Ptr<ActivationLayer> nextActivLayer;
if( nextData )
nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
&& currLayer->setActivation(nextActivLayer) )
// For now, the OpenCL target only supports fusion with a ReLU/ChannelsPReLU activation.
if ( preferableTarget != DNN_TARGET_OPENCL ||
(nextData &&
(!nextData->type.compare("ReLU") ||
!nextData->type.compare("ChannelsPReLU"))) )
{
printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
Ptr<ActivationLayer> nextActivLayer;
if( nextData )
nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
&& currLayer->setActivation(nextActivLayer) )
{
LayerData *activData = nextData;
printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
activData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
}
}
}
......
......@@ -157,7 +157,20 @@ public:
#ifdef HAVE_OPENCL
Ptr<OCL4DNNConvSpatial<float> > convolutionOp;
std::vector<UMat> umat_blobs;
bool fusedBias;
bool newWeightAndBias;
bool newActiv;
ocl4dnnFusedActiv_t activType;
#endif
ConvolutionLayerImpl()
{
#ifdef HAVE_OPENCL
fusedBias = false;
newWeightAndBias = false;
newActiv = false;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
#endif
}
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
......@@ -209,6 +222,10 @@ public:
activ = layer;
if (activ.empty())
reluslope.clear();
#ifdef HAVE_OPENCL
newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
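// The concrete fused type (ReLU or PReLU) is filled in later, when forward()
// extracts the slopes from the attached activation layer.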
#endif
return !activ.empty();
}
......@@ -221,6 +238,10 @@ public:
// we will need to re-compute the weights with the batch
// norm coefficients taken into account
weightsMat.release();
#ifdef HAVE_OPENCL
newWeightAndBias = true;
fusedBias = false;
#endif
return !bnorm.empty();
}
......@@ -230,6 +251,10 @@ public:
// we will need to re-compute the weights with the scaling
// coefficients taken into account
weightsMat.release();
#ifdef HAVE_OPENCL
newWeightAndBias = true;
fusedBias = false;
#endif
return !scaleLayer.empty();
}
......@@ -665,19 +690,49 @@ public:
convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
}
for (size_t ii = 0; ii < outputs.size(); ii++)
if ( newWeightAndBias )
{
UMat inpMat, outMat;
inpMat = inputs[ii]->getUMat(ACCESS_READ);
outMat = outputs[ii].getUMat(ACCESS_WRITE);
int batch_size = inpMat.size[0];
weightsMat.copyTo(umat_blobs[0]);
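// A bias folded in by batch-norm/scale fusion must also be (re)uploaded as the second device blob.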
if ( fusedBias )
{
if ( umat_blobs.size() < 2 )
umat_blobs.resize(2);
umat_blobs[1] = UMat(biasvec, true);
}
convolutionOp->setBias(fusedBias || hasBias());
newWeightAndBias = false;
}
if (!convolutionOp->Forward(inpMat, umat_blobs[0], hasBias() ? umat_blobs[1] : UMat(),
outMat, batch_size))
return false;
if ( newActiv )
{
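// Hand the fused activation parameters (ReLU slope or per-channel PReLU slopes)
// over to the OCL4DNN convolution.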
if ( activType == OCL4DNN_CONV_FUSED_ACTIV_RELU )
{
CV_Assert(!reluslope.empty());
convolutionOp->setActivReLU(true, reluslope[0]);
}
else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_PRELU)
{
CV_Assert(!reluslope.empty());
convolutionOp->setActivPReLU(true, reluslope);
}
else
{
convolutionOp->setActivReLU(false, 0);
convolutionOp->setActivPReLU(false, reluslope);
}
newActiv = false;
}
return true;
UMat inpMat, outMat;
inpMat = inputs[0]->getUMat(ACCESS_READ);
outMat = outputs[0].getUMat(ACCESS_WRITE);
int batch_size = inpMat.size[0];
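// The bias blob is passed whenever the layer has its own bias or one was folded in by fusion.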
return convolutionOp->Forward(inpMat,
umat_blobs[0],
(hasBias() || fusedBias) ? umat_blobs[1] : UMat(),
outMat,
batch_size);
}
#endif
......@@ -693,11 +748,6 @@ public:
CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
int ngroups = inputs[0]->size[1]/blobs[0].size[1];
CV_Assert(outputs[0].size[1] % ngroups == 0);
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs, outputs, internals))
int k, outCn = blobs[0].size[0];
if( weightsMat.empty() )
......@@ -761,6 +811,11 @@ public:
}
}
#ifdef HAVE_OPENCL
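// Any non-zero shift contributed by a fused layer means the OpenCL path needs a bias blob.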
if (shiftptr || shiftptr2)
fusedBias = true;
#endif
for( int i = 0; i < outCn; i++ )
{
float s1 = scaleptr ? scaleptr[i] : 1.f;
......@@ -784,7 +839,12 @@ public:
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
if( !activ_relu.empty() )
{
reluslope.assign(outCn+2, activ_relu->negativeSlope);
#ifdef HAVE_OPENCL
activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
#endif
}
Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
if( !activ_chprelu.empty() )
......@@ -795,9 +855,16 @@ public:
reluslope.resize(outCn+2);
std::copy(mdata, mdata + outCn, reluslope.begin());
reluslope[outCn] = reluslope[outCn+1] = reluslope[outCn-1];
#ifdef HAVE_OPENCL
activType = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
#endif
}
}
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs, outputs, internals))
int nstripes = std::max(getNumThreads(), 1);
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasvec, reluslope,
......
......@@ -73,6 +73,11 @@ struct OCL4DNNConvConfig
bool bias_term; // = false;
};
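// Activation kinds that can be fused into the OCL4DNN convolution kernels.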
typedef enum {
OCL4DNN_CONV_FUSED_ACTIV_NONE = 0,
OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,
OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
} ocl4dnnFusedActiv_t;
template<typename Dtype>
class OCL4DNNConvSpatial
......@@ -80,9 +85,13 @@ class OCL4DNNConvSpatial
public:
explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
~OCL4DNNConvSpatial();
bool Forward(const UMat& bottom_data, const UMat& weight,
bool Forward(const UMat& bottom_data,
const UMat& weight,
const UMat& bias,
UMat& top_data, int32_t batch_size);
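// Fusion hooks: the DNN convolution layer calls these after layer fusion to fold
// an activation and/or an extra bias into the generated kernels.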
void setActivReLU(bool fuse_activ, float slope);
void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
void setBias(bool bias_term);
private:
struct kernelConfig
......@@ -194,9 +203,9 @@ class OCL4DNNConvSpatial
int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
bool setupIDLF(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
bool createIDLFKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
bool createBasicKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
......@@ -244,10 +253,13 @@ class OCL4DNNConvSpatial
int lx, int ly, int lz,
bool swizzle, bool nullLocal);
void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx);
int32_t group_;
bool bias_term_;
UMat swizzled_weights_umat;
UMat bottom_data2_;
int32_t bottom_index_;
int32_t output_h_;
......@@ -291,6 +303,9 @@ class OCL4DNNConvSpatial
std::stringstream options_;
cv::ocl::ProgramSource src_;
int32_t prev_kernel_type_;
float negative_slope_;
UMat negative_slope_umat_;
ocl4dnnFusedActiv_t fused_activ_;
};
typedef enum {
......
......@@ -78,6 +78,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
num_output_ = config.out_shape[dims - spatial_dims - 1];
group_ = config.group;
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
negative_slope_ = 0;
prev_kernel_type_ = -1;
tuned_ = false;
......@@ -138,6 +140,38 @@ OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
}
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
{
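// Emit a build-time define so the OpenCL convolution source compiles the matching
// fused-activation path.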
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
addDef("FUSED_CONV_RELU", 1);
break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
addDef("FUSED_CONV_PRELU", 1);
break;
default:
;
}
return;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx)
{
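// Fused kernels expect the activation parameter first: a scalar negative slope for
// ReLU, a per-channel slope buffer for PReLU.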
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_);
break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
break;
default:
;
}
return;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
{
......@@ -221,6 +255,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
addDef("APPLY_BIAS", bias_term_);
setFusionDefine(fused_activ_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
......@@ -242,6 +277,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", M_);
addDef("ZPAR", 1);
setFusionDefine(fused_activ_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
......@@ -278,6 +314,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("TILE_N_LAST", M_ % 32);
addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
addDef("APPLY_BIAS", bias_term_);
setFusionDefine(fused_activ_);
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
}
}
......@@ -302,6 +339,37 @@ void OCL4DNNConvSpatial<Dtype>::setupKernel()
setupKernelDetails(kernelType_, blockM_, blockK_, blockN_);
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setBias(bool bias_term)
{
bias_term_ = bias_term;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivReLU(bool fuse_activ, float slope)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU;
negative_slope_ = slope;
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float> &slope)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
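// Wrap the per-channel slopes in a Mat header and upload them to device memory
// for the PReLU kernel argument.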
Mat tmpMat = Mat(num_output_, 1, CV_32FC1, (uchar*)&slope[0]);
tmpMat.copyTo(negative_slope_umat_);
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
const UMat& weight,
......@@ -310,7 +378,6 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
int32_t numImages)
{
num_ = numImages;
prepareKernel(bottom, top, weight, bias, numImages);
return convolve(bottom, top, weight, bias, numImages, bestKernelConfig, cv::ocl::Queue::getDefault());
}
......@@ -358,7 +425,9 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
<< "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_"
<< "p" << pad_w_ << "x" << pad_h_ << "_"
<< "num" << num_ << "_"
<< "M" << M_;
<< "M" << M_ << "_"
<< "activ" << fused_activ_;
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
key_sanitized_ = key_;
......@@ -608,6 +677,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
......@@ -700,6 +770,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
......@@ -807,13 +878,16 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
int32_t output_image_offset = n * top_dim_
+ output_w_ * output_h_ * M_ * g;
cl_uint argIdx = 0;
int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
int32_t kernel_offset = kernel_h_ * kernel_w_ *
(channels_ / group_) * M_ * g;
ocl::Kernel kernel(config->kernelName.c_str(), program);
if (kernel.empty())
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
......@@ -1058,9 +1132,9 @@ bool OCL4DNNConvSpatial<float>::createGEMMLikeConvKernel(int32_t blockM,
}
template<>
bool OCL4DNNConvSpatial<float>::setupIDLF(int32_t blockWidth,
int32_t blockHeight,
int32_t simd_size)
bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t simd_size)
{
int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
const int32_t num_output_maps = M_;
......@@ -1122,7 +1196,7 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
src_ = ocl::ProgramSource();
if (kernelType == KERNEL_TYPE_INTEL_IDLF)
return setupIDLF(blockWidth, blockHeight, blockDepth);
return createIDLFKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_BASIC)
return createBasicKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
......