Commit 0488d9bd authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

optimize out scaleLayer & concatLayer whenever possible

fixed problem in concat layer by disabling memory re-use in layers with multiple inputs

trying to fix the tests when Halide is used to run deep nets

another attempt to fix Halide tests

see if the Halide tests will pass with concat layer fusion turned off

trying to fix failures in halide tests; another try

one more experiment to make halide_concat & halide_enet tests pass

continue attempts to fix halide tests

moving on

uncomment parallel concat layer

seemingly fixed failures in Halide tests and re-enabled concat layer fusion; thanks to dkurt for the patch
parent 431e2e6d
......@@ -152,6 +152,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
class CV_EXPORTS ActivationLayer;
class CV_EXPORTS BatchNormLayer;
class CV_EXPORTS ScaleLayer;
/** @brief This interface class allows building new Layers — the building blocks of networks.
*
......@@ -269,6 +270,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
/**
* @brief Tries to attach the subsequent scaling layer to this layer, i.e. performs layer fusion in this particular case.
* @param[in] layer The subsequent scaling layer.
*
* Returns true if the scaling layer has been attached successfully.
*/
virtual bool setScale(const Ptr<ScaleLayer>& layer);
/**
* @brief Detaches all the layers attached to the particular layer.
*/
virtual void unsetAttached();
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
......@@ -495,9 +509,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
/** @overload */
CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
/** @brief Computes FLOP for whole loaded model with specified input shapes.
* @param netInputShapes vector of shapes for all net inputs.
* @returns computed FLOP.
......@@ -507,10 +522,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const std::vector<MatShape>& netInputShapes) const;
const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const MatShape& netInputShape) const;
const MatShape& netInputShape) const;
/** @brief Returns list of types for layer used in model.
* @param layersTypes output parameter for returning types.
......@@ -557,8 +572,13 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
private:
/** @brief Enables or disables layer fusion in the network.
* @param fusion true to enable the fusion, false to disable. The fusion is enabled by default.
*/
CV_WRAP void enableFusion(bool fusion);
private:
struct Impl;
Ptr<Impl> impl;
};
......
This diff is collapsed.
......@@ -94,6 +94,78 @@ public:
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; // By channels
}
class ChannelConcatInvoker : public ParallelLoopBody
{
public:
std::vector<Mat*>* inputs;
Mat* output;
int nstripes;
std::vector<const float*> chptrs;
static void run(std::vector<Mat*>& inputs, Mat& output, int nstripes)
{
ChannelConcatInvoker cc;
cc.inputs = &inputs;
cc.output = &output;
cc.nstripes = nstripes;
size_t i, ninputs = inputs.size();
int nchannels = 0, batchsz = output.size[0];
for( i = 0; i < ninputs; i++ )
{
Mat& inp = *inputs[i];
CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F );
cc.chptrs.resize(nchannels*batchsz);
int ofs = 0;
for( i = 0; i < ninputs; i++)
{
Mat& inp = *inputs[i];
for( int j = 0; j < batchsz; j++ )
for( int k = 0; k < inp.size[1]; k++ )
{
const float* ptr = inp.ptr<float>(j, k);
cc.chptrs[ofs + j*nchannels + k] = ptr;
}
ofs += inp.size[1];
}
parallel_for_(Range(0, nstripes), cc, nstripes);
}
ChannelConcatInvoker() {}
void operator()(const Range& r) const
{
size_t planeSize = (size_t)output->size[2]*output->size[3];
size_t nch = chptrs.size();
size_t total = nch*planeSize;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(total, r.end*stripeSize);
const float** ptrs = (const float**)&chptrs[0];
float* outptr = output->ptr<float>();
size_t blockSize0 = 1 << 16;
for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
{
size_t ch = ofs0/planeSize;
size_t ofs = ofs0 - ch*planeSize;
size_t blockSize = std::min(blockSize0, planeSize - ofs);
memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
ofs0 += blockSize;
}
}
};
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
......@@ -101,14 +173,23 @@ public:
int cAxis = clamp(axis, inputs[0]->dims);
Mat& outMat = outputs[0];
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
if( cAxis == 1 && outMat.dims == 4 )
{
int nstripes = getNumThreads();
ChannelConcatInvoker::run(inputs, outMat, nstripes);
}
else
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
inputs[i]->copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
inputs[i]->copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
}
}
}
......
......@@ -148,6 +148,7 @@ public:
std::vector<float> reluslope;
Ptr<ActivationLayer> activ;
Ptr<BatchNormLayer> bnorm;
Ptr<ScaleLayer> scaleLayer;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
......@@ -202,6 +203,9 @@ public:
bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
{
// for now the scale layer followed by the batch norm cannot be fused, only vice versa.
if( !scaleLayer.empty() )
return false;
bnorm = layer;
// we will need to re-compute the weights with the batch
// norm coefficients taken into account
......@@ -209,6 +213,15 @@ public:
return !bnorm.empty();
}
// Attaches a subsequent scaling layer so its per-channel coefficients can
// be folded into the convolution weights. Returns true if the layer was
// accepted for fusion.
bool setScale(const Ptr<ScaleLayer>& layer)
{
    scaleLayer = layer;
    // Invalidate the cached weights: they must be rebuilt with the
    // scaling coefficients applied at the next finalization.
    weightsMat.release();
    return !scaleLayer.empty();
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
......@@ -678,32 +691,56 @@ public:
biasvec[k] = biasMat.at<float>(k);
}
if( !bnorm.empty() )
if( !bnorm.empty() || !scaleLayer.empty() )
{
Mat scale, shift;
bnorm->getScaleShift(scale, shift);
Mat scale, shift, scale2, shift2;
const float *scaleptr = 0, *shiftptr = 0;
const float *scaleptr2 = 0, *shiftptr2 = 0;
CV_Assert( scale.isContinuous() && shift.isContinuous() &&
scale.type() == CV_32F && shift.type() == CV_32F &&
scale.total() == (size_t)outCn &&
shift.total() == (size_t)outCn );
if( !bnorm.empty() )
{
bnorm->getScaleShift(scale, shift);
CV_Assert( scale.isContinuous() && shift.isContinuous() &&
scale.type() == CV_32F && shift.type() == CV_32F &&
scale.total() == (size_t)outCn &&
shift.total() == (size_t)outCn );
scaleptr = scale.ptr<float>();
shiftptr = shift.ptr<float>();
}
if( !scaleLayer.empty() )
{
scale2 = scaleLayer->blobs[0];
CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
scale2.total() == (size_t)outCn );
scaleptr2 = scale2.ptr<float>();
if( scaleLayer->hasBias )
{
shift2 = scaleLayer->blobs[1];
CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
shift2.total() == (size_t)outCn );
shiftptr2 = shift2.ptr<float>();
}
}
for( int i = 0; i < outCn; i++ )
{
float s = scale.at<float>(i);
float delta = shift.at<float>(i);
float s1 = scaleptr ? scaleptr[i] : 1.f;
float delta1 = shiftptr ? shiftptr[i] : 0.f;
float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
float* w_i = weightsMat.ptr<float>(i);
int j, wcols = weightsMat.cols;
for( j = 0; j < wcols; j++ )
w_i[j] *= s;
w_i[j] *= (s1*s2);
biasvec[i] = biasvec[i]*s + delta;
biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
}
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
}
reluslope.clear();
if( activ )
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
......
......@@ -517,7 +517,8 @@ TEST_P(Concat, Accuracy)
Net net;
std::vector<int> convLayerIds(numChannels.channels);
std::vector<int> convLayerIds;
convLayerIds.reserve(numChannels.channels);
for (int i = 0, n = numChannels.channels; i < n; ++i)
{
if (!numChannels[i])
......@@ -537,8 +538,9 @@ TEST_P(Concat, Accuracy)
convParam.name = ss.str();
convParam.blobs.push_back(weights);
convLayerIds[i] = net.addLayer(convParam.name, convParam.type, convParam);
net.connect(0, 0, convLayerIds[i], 0);
int layerId = net.addLayer(convParam.name, convParam.type, convParam);
convLayerIds.push_back(layerId);
net.connect(0, 0, layerId, 0);
}
LayerParams concatParam;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment