Commit 86e8a105 authored by Alexander Alekhin

Merge pull request #9090 from vpisarev:dnn_optim_scale_concat

parents a586ef72 0488d9bd
......@@ -152,6 +152,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
class CV_EXPORTS ActivationLayer;
class CV_EXPORTS BatchNormLayer;
class CV_EXPORTS ScaleLayer;
/** @brief This interface class allows to build new Layers - are building blocks of networks.
*
......@@ -269,6 +270,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
/**
* @brief Tries to attach the subsequent scaling layer to this layer, i.e. performs layer fusion in this particular case.
* @param[in] layer The subsequent scaling layer.
*
* Returns true if the scaling layer has been attached successfully.
*/
virtual bool setScale(const Ptr<ScaleLayer>& layer);
/**
* @brief Detaches all the layers attached to the particular layer.
*/
virtual void unsetAttached();
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
......@@ -495,9 +509,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
/** @overload */
CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
/** @brief Computes FLOP for whole loaded model with specified input shapes.
* @param netInputShapes vector of shapes for all net inputs.
* @returns computed FLOP.
......@@ -507,10 +522,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const std::vector<MatShape>& netInputShapes) const;
const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const MatShape& netInputShape) const;
const MatShape& netInputShape) const;
/** @brief Returns list of types for layer used in model.
* @param layersTypes output parameter for returning types.
......@@ -557,8 +572,13 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
private:
/** @brief Enables or disables layer fusion in the network.
* @param fusion true to enable the fusion, false to disable. The fusion is enabled by default.
*/
CV_WRAP void enableFusion(bool fusion);
private:
struct Impl;
Ptr<Impl> impl;
};
......
This diff is collapsed.
......@@ -94,6 +94,78 @@ public:
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; // By channels
}
// Parallel worker that concatenates 4D CV_32F blobs along the channel axis
// (the cAxis == 1 fast path). Instead of copying per-input sub-ranges of the
// output, run() builds one flat table of plane pointers — one entry per
// (sample, output-channel) pair, in output storage order — so operator()
// can service any stripe of the continuous output buffer with plain memcpy.
class ChannelConcatInvoker : public ParallelLoopBody
{
public:
// Non-owning pointers; valid only for the duration of run().
std::vector<Mat*>* inputs;
Mat* output;
// Number of parallel stripes the total copy is split into.
int nstripes;
// chptrs[j*nchannels + k] = source plane for sample j, output channel k;
// size is batchsz*nchannels (see run()).
std::vector<const float*> chptrs;
// Validates the inputs, fills the plane-pointer table and launches the
// parallel copy. All inputs and the output must be continuous CV_32F 4D
// blobs agreeing in every dimension except the channel one, and the input
// channel counts must sum to the output channel count (asserted below).
static void run(std::vector<Mat*>& inputs, Mat& output, int nstripes)
{
ChannelConcatInvoker cc;
cc.inputs = &inputs;
cc.output = &output;
cc.nstripes = nstripes;
size_t i, ninputs = inputs.size();
int nchannels = 0, batchsz = output.size[0];
for( i = 0; i < ninputs; i++ )
{
Mat& inp = *inputs[i];
// Continuity is required because operator() memcpy's whole planes
// straight out of each input's buffer.
CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F );
// One table entry per (sample, channel) plane of the output.
cc.chptrs.resize(nchannels*batchsz);
// ofs = index of the first output channel contributed by input i;
// the table is filled sample-major, channel-minor, matching the
// memory order of the continuous output blob.
int ofs = 0;
for( i = 0; i < ninputs; i++)
{
Mat& inp = *inputs[i];
for( int j = 0; j < batchsz; j++ )
for( int k = 0; k < inp.size[1]; k++ )
{
const float* ptr = inp.ptr<float>(j, k);
cc.chptrs[ofs + j*nchannels + k] = ptr;
}
ofs += inp.size[1];
}
parallel_for_(Range(0, nstripes), cc, nstripes);
}
ChannelConcatInvoker() {}
// Copies this worker's share of the output. The total work is
// chptrs.size() planes of planeSize floats each, split into nstripes
// roughly equal stripes; a stripe may start/end mid-plane, so each
// iteration copies at most the remainder of the current plane, further
// capped at blockSize0 floats.
void operator()(const Range& r) const
{
size_t planeSize = (size_t)output->size[2]*output->size[3];
size_t nch = chptrs.size();
size_t total = nch*planeSize;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(total, r.end*stripeSize);
const float** ptrs = (const float**)&chptrs[0];
float* outptr = output->ptr<float>();
// Cap on floats copied per memcpy (64K elements).
size_t blockSize0 = 1 << 16;
for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
{
// ch = which (sample, channel) plane ofs0 falls in; ofs = offset
// within that plane. Valid because the output is continuous and
// chptrs mirrors its plane order.
size_t ch = ofs0/planeSize;
size_t ofs = ofs0 - ch*planeSize;
size_t blockSize = std::min(blockSize0, planeSize - ofs);
memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
ofs0 += blockSize;
}
}
};
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
......@@ -101,14 +173,23 @@ public:
int cAxis = clamp(axis, inputs[0]->dims);
Mat& outMat = outputs[0];
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
if( cAxis == 1 && outMat.dims == 4 )
{
int nstripes = getNumThreads();
ChannelConcatInvoker::run(inputs, outMat, nstripes);
}
else
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
inputs[i]->copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
inputs[i]->copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
}
}
}
......
......@@ -148,6 +148,7 @@ public:
std::vector<float> reluslope;
Ptr<ActivationLayer> activ;
Ptr<BatchNormLayer> bnorm;
Ptr<ScaleLayer> scaleLayer;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
......@@ -202,6 +203,9 @@ public:
bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
{
// for now the scale layer followed by the batch norm cannot be fused, only vice versa.
if( !scaleLayer.empty() )
return false;
bnorm = layer;
// we will need to re-compute the weights with the batch
// norm coefficients taken into account
......@@ -209,6 +213,15 @@ public:
return !bnorm.empty();
}
// Attaches a subsequent scaling layer for fusion into this convolution.
// Storing the layer invalidates the cached fused weights, which are then
// rebuilt with the scaling coefficients folded in on the next finalize.
// Returns true when a non-empty scaling layer was attached.
bool setScale(const Ptr<ScaleLayer>& layer)
{
    scaleLayer = layer;
    // Drop the precomputed weights so they get regenerated with the
    // scaling coefficients taken into account.
    weightsMat.release();
    const bool attached = !scaleLayer.empty();
    return attached;
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
......@@ -678,32 +691,56 @@ public:
biasvec[k] = biasMat.at<float>(k);
}
if( !bnorm.empty() )
if( !bnorm.empty() || !scaleLayer.empty() )
{
Mat scale, shift;
bnorm->getScaleShift(scale, shift);
Mat scale, shift, scale2, shift2;
const float *scaleptr = 0, *shiftptr = 0;
const float *scaleptr2 = 0, *shiftptr2 = 0;
CV_Assert( scale.isContinuous() && shift.isContinuous() &&
scale.type() == CV_32F && shift.type() == CV_32F &&
scale.total() == (size_t)outCn &&
shift.total() == (size_t)outCn );
if( !bnorm.empty() )
{
bnorm->getScaleShift(scale, shift);
CV_Assert( scale.isContinuous() && shift.isContinuous() &&
scale.type() == CV_32F && shift.type() == CV_32F &&
scale.total() == (size_t)outCn &&
shift.total() == (size_t)outCn );
scaleptr = scale.ptr<float>();
shiftptr = shift.ptr<float>();
}
if( !scaleLayer.empty() )
{
scale2 = scaleLayer->blobs[0];
CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
scale2.total() == (size_t)outCn );
scaleptr2 = scale2.ptr<float>();
if( scaleLayer->hasBias )
{
shift2 = scaleLayer->blobs[1];
CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
shift2.total() == (size_t)outCn );
shiftptr2 = shift2.ptr<float>();
}
}
for( int i = 0; i < outCn; i++ )
{
float s = scale.at<float>(i);
float delta = shift.at<float>(i);
float s1 = scaleptr ? scaleptr[i] : 1.f;
float delta1 = shiftptr ? shiftptr[i] : 0.f;
float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
float* w_i = weightsMat.ptr<float>(i);
int j, wcols = weightsMat.cols;
for( j = 0; j < wcols; j++ )
w_i[j] *= s;
w_i[j] *= (s1*s2);
biasvec[i] = biasvec[i]*s + delta;
biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
}
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
}
reluslope.clear();
if( activ )
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
......
......@@ -517,7 +517,8 @@ TEST_P(Concat, Accuracy)
Net net;
std::vector<int> convLayerIds(numChannels.channels);
std::vector<int> convLayerIds;
convLayerIds.reserve(numChannels.channels);
for (int i = 0, n = numChannels.channels; i < n; ++i)
{
if (!numChannels[i])
......@@ -537,8 +538,9 @@ TEST_P(Concat, Accuracy)
convParam.name = ss.str();
convParam.blobs.push_back(weights);
convLayerIds[i] = net.addLayer(convParam.name, convParam.type, convParam);
net.connect(0, 0, convLayerIds[i], 0);
int layerId = net.addLayer(convParam.name, convParam.type, convParam);
convLayerIds.push_back(layerId);
net.connect(0, 0, layerId, 0);
}
LayerParams concatParam;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment