Commit 9e26b24d authored by Vadim Pisarevsky

improved speed of ENet processing.

parent 3f5b4655
......@@ -98,14 +98,19 @@ int main(int argc, char **argv)
net.setBlob("", inputBlob); //set the network input
//! [Set input blob]
const int N = 3;
TickMeter tm;
tm.start();
//! [Make forward pass]
net.forward(); //compute output
//! [Make forward pass]
tm.stop();
for( int i = 0; i < N; i++ )
{
TickMeter tm_;
tm_.start();
net.forward(); //compute output
tm_.stop();
if( i == 0 || tm_.getTimeTicks() < tm.getTimeTicks() )
tm = tm_;
}
//! [Gather output]
......
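The sample change above times the forward pass with a best-of-N scheme: run the network N times and keep the fastest measurement, so one-time allocations and cache warm-up on the first run do not distort the reported speed. Below is a minimal, self-contained sketch of the same pattern with cv::TickMeter; the placeholder workload stands in for net.forward() and is not part of the original sample.

#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>
#include <iostream>

int main()
{
    const int N = 3;        // number of timed runs
    cv::TickMeter best;     // keeps the fastest of the N measurements
    cv::Mat m(1000, 1000, CV_32F, cv::Scalar(2)), r;
    for (int i = 0; i < N; i++)
    {
        cv::TickMeter tm;
        tm.start();
        cv::sqrt(m, r);     // placeholder workload; the sample times net.forward()
        tm.stop();
        // keep the minimum: the first run may include one-time allocations
        if (i == 0 || tm.getTimeTicks() < best.getTimeTicks())
            best = tm;
    }
    std::cout << "best time: " << best.getTimeMilli() << " ms" << std::endl;
    return 0;
}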
......@@ -41,6 +41,15 @@ public:
Mat* inp = inputs[i];
outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
}
varMeanScale = 1.f;
if (!hasWeights && !hasBias) {
varMeanScale = *blobs[2].ptr<float>();
if (varMeanScale != 0)
varMeanScale = 1/varMeanScale;
}
cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
......@@ -52,16 +61,6 @@ public:
int weightsBlobIndex = 2;
int biasBlobIndex = weightsBlobIndex + hasWeights;
float varMeanScale = 1;
if (!hasWeights && !hasBias) {
varMeanScale = *blobs[2].ptr<float>();
if (varMeanScale != 0)
varMeanScale = 1/varMeanScale;
}
Mat invStdMat;
cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
int rows = inpBlob.size[2];
int cols = inpBlob.size[3];
......@@ -92,7 +91,8 @@ public:
}
bool hasWeights, hasBias;
float epsilon;
float epsilon, varMeanScale;
Mat invStdMat;
};
Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
......
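The batch-norm change moves the per-channel computation of 1/sqrt(var*scale + eps) (invStdMat) from forward() into allocate(), so it is evaluated once per network setup instead of on every inference. The following is a simplified, illustrative sketch of that idea; SimpleBatchNorm and its members are invented names for the example, not the actual cv::dnn classes.

#include <opencv2/core.hpp>

struct SimpleBatchNorm               // hypothetical class, for illustration only
{
    cv::Mat mean, var, invStd;       // per-channel statistics, CV_32F
    float epsilon;

    // called once, when input shapes are known
    void allocate()
    {
        // invStd = (var + eps)^(-1/2), computed once and cached
        cv::pow(var + epsilon, -0.5, invStd);
    }

    // called for every input blob; only cheap per-element work remains
    void forwardChannel(const cv::Mat& src, cv::Mat& dst, int c) const
    {
        float m = mean.at<float>(c);
        float s = invStd.at<float>(c);
        dst = (src - m) * s;         // normalize using the cached scale
    }
};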
......@@ -15,8 +15,7 @@ using std::pow;
template<typename Func>
class ElementWiseLayer : public Func::Layer
{
Func func;
public:
template<typename Dtype>
class PBody : public cv::ParallelLoopBody
{
......@@ -35,9 +34,7 @@ class ElementWiseLayer : public Func::Layer
}
};
public:
ElementWiseLayer(const Func &f=Func()) : func(f) {}
ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}
void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
......@@ -58,9 +55,16 @@ public:
Range sizeRange = Range(0, dst.total());
CV_Assert(src.type() == CV_32F);
cv::parallel_for_(sizeRange, PBody<float>(dst, func));
PBody<float> body(dst, func);
if( run_parallel )
cv::parallel_for_(sizeRange, body);
else
body(sizeRange);
}
}
Func func;
bool run_parallel;
};
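ElementWiseLayer gains a run_parallel flag: cheap functors skip cv::parallel_for_ and its scheduling overhead by invoking the loop body directly, while expensive functors (tanh, sigmoid, BNLL, pow) still fan out across threads. A minimal sketch of that dispatch, assuming a continuous CV_32F input; TanhBody and applyTanh are illustrative names, not part of the commit.

#include <opencv2/core.hpp>
#include <cmath>

struct TanhBody : public cv::ParallelLoopBody
{
    const float* src; float* dst;
    TanhBody(const float* s, float* d) : src(s), dst(d) {}
    void operator()(const cv::Range& r) const
    {
        for (int i = r.start; i < r.end; i++)
            dst[i] = std::tanh(src[i]);
    }
};

static void applyTanh(const cv::Mat& src, cv::Mat& dst, bool run_parallel)
{
    CV_Assert(src.type() == CV_32F && src.isContinuous());
    dst.create(src.dims, src.size.p, src.type());
    cv::Range full(0, (int)src.total());
    TanhBody body(src.ptr<float>(), dst.ptr<float>());
    if (run_parallel)
        cv::parallel_for_(full, body);   // heavy functor: spread across threads
    else
        body(full);                      // cheap functor: stay single-threaded
}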
struct ReLUFunctor
......@@ -135,8 +139,24 @@ struct PowerFunctor
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return power == 1.0f ? (TFloat)shift + (TFloat)scale * x :
pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
}
};
struct PowerFunctor1
{
typedef PowerLayer Layer;
const float scale;
const float shift;
PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
: scale(scale_), shift(shift_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return (TFloat)shift + (TFloat)scale * x;
}
};
......@@ -165,12 +185,12 @@ public:
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
CV_Assert(inputs.size() == 1);
Mat &inpBlob = *inputs[0];
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Mat &outBlob = outputs[ii];
CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());
CV_Assert(blobs[0].total() == inpBlob.size[1]);
......@@ -181,8 +201,16 @@ public:
Mat inpBlobPlane = getPlane(inpBlob, 0, n);
Mat outBlobPlane = getPlane(outBlob, 0, n);
threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
size_t i, planeTotal = inpBlobPlane.total();
const float* inptr = inpBlobPlane.ptr<float>();
float* outptr = outBlobPlane.ptr<float>();
for( i = 0; i < planeTotal; i++ )
{
float val = inptr[i];
outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
}
//threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
//scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
}
}
}
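The channel-wise PReLU forward previously needed two whole-plane OpenCV calls (threshold followed by scaleAdd), i.e. two full passes over the data per channel; the change fuses them into a single scalar loop. A stand-alone sketch of the fused form is below; preluPlane is an invented helper name, and a continuous CV_32F plane is assumed.

#include <opencv2/core.hpp>

static void preluPlane(const cv::Mat& inPlane, cv::Mat& outPlane, float slope)
{
    CV_Assert(inPlane.type() == CV_32F && inPlane.isContinuous());
    outPlane.create(inPlane.dims, inPlane.size.p, CV_32F);
    const float* inptr = inPlane.ptr<float>();
    float* outptr = outPlane.ptr<float>();
    size_t total = inPlane.total();
    for (size_t i = 0; i < total; i++)
    {
        float val = inptr[i];
        // positives pass through, negatives are scaled by the per-channel slope
        outptr[i] = val * (val >= 0.f ? 1.f : slope);
    }
}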
......@@ -196,7 +224,7 @@ Ptr<_Layer> _Layer::create() { \
Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
{
float negativeSlope = params.get<float>("negative_slope", 0.f);
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(false, ReLUFunctor(negativeSlope)));
l->setParamsFrom(params);
return l;
......@@ -204,7 +232,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
{
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -212,7 +240,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
{
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -228,7 +256,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
{
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -239,7 +267,9 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
float power = params.get<float>("power", 1.0f);
float scale = params.get<float>("scale", 1.0f);
float shift = params.get<float>("shift", 0.0f);
Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
Ptr<PowerLayer> l(power == 1.f ?
(PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
(PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
l->setParamsFrom(params);
return l;
......
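For power == 1 the power layer collapses to the affine form shift + scale*x, so PowerLayer::create now picks the cheap PowerFunctor1 (run sequentially), and reserves the per-element pow() call plus parallel execution for the general case. A small hedged sketch of the same choose-the-cheaper-functor pattern outside cv::dnn; AffineOp, PowOp, and runPowerLayer are invented names for the example.

#include <cmath>
#include <vector>

struct AffineOp                      // the power == 1 fast path: shift + scale*x
{
    float scale, shift;
    float operator()(float x) const { return shift + scale * x; }
};

struct PowOp                         // the general case: pow(shift + scale*x, power)
{
    float power, scale, shift;
    float operator()(float x) const { return std::pow(shift + scale * x, power); }
};

template <typename Op>
static void apply(const Op& op, std::vector<float>& v)
{
    for (size_t i = 0; i < v.size(); i++)
        v[i] = op(v[i]);
}

static void runPowerLayer(std::vector<float>& v, float power, float scale, float shift)
{
    if (power == 1.f)
        apply(AffineOp{scale, shift}, v);        // no per-element pow() call
    else
        apply(PowOp{power, scale, shift}, v);    // general formula
}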
......@@ -98,15 +98,14 @@ public:
void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
{
Mat& output = outputs[0];
switch (op)
{
case SUM:
{
CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
Mat& output = outputs[0];
output.setTo(0.);
if (0 < coeffs.size())
{
output.setTo(0.);
for (size_t i = 0; i < inputs.size(); i++)
{
output += *inputs[i] * coeffs[i];
......@@ -114,32 +113,26 @@ public:
}
else
{
for (size_t i = 0; i < inputs.size(); i++)
add(*inputs[0], *inputs[1], output);
for (size_t i = 2; i < inputs.size(); i++)
{
output += *inputs[i];
}
}
}
break;
case PROD:
{
Mat& output = outputs[0];
output.setTo(1.);
for (size_t i = 0; i < inputs.size(); i++)
{
output = output.mul(*inputs[i]);
}
}
break;
case MAX:
{
Mat& output = outputs[0];
cv::max(*inputs[0], *inputs[1], output);
for (size_t i = 2; i < inputs.size(); i++)
{
cv::max(output, *inputs[i], output);
}
}
break;
default:
CV_Assert(0);
......
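In the SUM branch without coefficients, the output no longer has to be zero-filled before accumulation: the first two inputs are added straight into the output and the remaining ones accumulated on top, saving one full write pass over the blob. A hedged sketch of that accumulation, assuming at least two equally sized CV_32F inputs; eltwiseSum is an illustrative helper name.

#include <opencv2/core.hpp>
#include <vector>

static void eltwiseSum(const std::vector<cv::Mat>& inputs, cv::Mat& output)
{
    CV_Assert(inputs.size() >= 2);
    // write the sum of the first two inputs directly into output;
    // no output.setTo(0) pass is needed
    cv::add(inputs[0], inputs[1], output);
    for (size_t i = 2; i < inputs.size(); i++)
        output += inputs[i];
}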
......@@ -49,264 +49,15 @@ namespace cv
namespace dnn
{
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kernel_h*kernel_w*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, cvCeil(-start_in_r/(float)dilation_h));
int end_k_r = std::min(kernel_h, cvCeil((height - start_in_r)/(float)dilation_h));
int start_k_c = std::max(0, cvCeil(-start_in_c/(float)dilation_w));
int end_k_c = std::min(kernel_w, cvCeil((width - start_in_c)/(float)dilation_w));
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kernel_h*kernel_w;
int in_r = start_in_r + start_k_r*dilation_h;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kernel_w;
int in_c = start_in_c + start_k_c*dilation_w;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
};
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
const Dtype* data_col_ = data_col;
Dtype* data_im_ = data_im;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im_[index] = val;
}
}
};
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
Dtype* data_im)
{
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col);
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf);
}
}
......
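The per-column im2col/col2im loop bodies are removed from the header in favor of the im2row/col2im functions declared above (implemented elsewhere in the module): im2row lays out one output row per spatial position, with the whole channels x kernel_h x kernel_w receptive field stored contiguously, so the convolution becomes a single GEMM against the filter matrix. Below is a hedged, single-threaded sketch of that layout; im2row_sketch is an invented name, dilation is omitted for brevity, and padded positions are written as zeros.

#include <cstring>

static void im2row_sketch(const float* data_im, int channels, int height, int width,
                          int kernel_h, int kernel_w, int pad_h, int pad_w,
                          int stride_h, int stride_w,
                          int height_col, int width_col, float* data_col)
{
    int row_len = channels * kernel_h * kernel_w;      // one receptive field per row
    std::memset(data_col, 0, (size_t)height_col * width_col * row_len * sizeof(float));
    for (int out_r = 0; out_r < height_col; out_r++)
        for (int out_c = 0; out_c < width_col; out_c++)
        {
            float* row = data_col + (out_r * width_col + out_c) * row_len;
            int in_r0 = out_r * stride_h - pad_h;
            int in_c0 = out_c * stride_w - pad_w;
            for (int c = 0; c < channels; c++)
                for (int kr = 0; kr < kernel_h; kr++)
                    for (int kc = 0; kc < kernel_w; kc++)
                    {
                        int in_r = in_r0 + kr, in_c = in_c0 + kc;
                        if (in_r >= 0 && in_r < height && in_c >= 0 && in_c < width)
                            row[(c * kernel_h + kr) * kernel_w + kc] =
                                data_im[(c * height + in_r) * width + in_c];
                    }
        }
}

With this layout, multiplying the (height_col*width_col) x row_len matrix by the transposed num_output x row_len weight matrix yields all output channels for the image in one GEMM call.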