Adding CPU parallelization for element-wise layers and im2col operation

4cb108ef · Vitaliy Lyudvichenko · 6d3cb808 · 4cb108ef · 4cb108ef · 4cb108ef
Commit 4cb108ef authored Jun 23, 2016 by Vitaliy Lyudvichenko
4 changed files
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -179,9 +179,9 @@ namespace dnn
 #endif // HAVE_OPENCL
        if (inpBlob.type() == CV_32F)
-            im2col_cpu((float *)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
+            im2col_CpuPBody<float>::run((float*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
        if (inpBlob.type() == CV_64F)
-            im2col_cpu((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
+            im2col_CpuPBody<double>::run((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
    }
    void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
@@ -253,9 +253,9 @@ namespace dnn
        if (is1x1()) return;
        if (dstMat.type() == CV_32F)
-            col2im_cpu((float*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float*)dstMat.ptr());
+            col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
        if (dstMat.type() == CV_64F)
-            col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr());
+            col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
    }
 }
 }
--- a/modules/dnn/src/layers/elementwise_layers.hpp
+++ b/modules/dnn/src/layers/elementwise_layers.hpp
@@ -55,130 +55,147 @@ using std::exp;
 using std::tanh;
 using std::pow;
-    template<typename Func>
+template<typename Func>
-    class ElementWiseLayer : public Layer
+class ElementWiseLayer : public Layer
+{
+    Func func;
+    template<typename Dtype>
+    class PBody : public cv::ParallelLoopBody
    {
-        Func func;
+        Dtype *data;
+        Func &func;
    public:
-        ElementWiseLayer(LayerParams &_params) : func(_params) {}
+        PBody(Blob &blob, Func &func_) :
+            func(func_), data(blob.ptr<Dtype>())
+        {}
-        void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+        void operator()(const Range &r) const
        {
-            outputs.resize(inputs.size());
+            for (int i = r.start; i < r.end; i++)
-            for (size_t i = 0; i < inputs.size(); i++)
+                data[i] = func(data[i]);
-                outputs[i].shareFrom(*inputs[i]); //no data copy
-        }
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-        {
-            for (size_t i = 0; i < inputs.size(); i++)
-            {
-                CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
-                size_t size = outputs[i].total();
-                if (outputs[i].type() == CV_32F)
-                {
-                    float *data = outputs[i].ptrf();
-                    for (size_t j = 0; j < size; j++)
-                        data[j] = func(data[j]);
-                }
-                else if (outputs[i].type() == CV_64F)
-                {
-                    double *data = outputs[i].ptr<double>();
-                    for (size_t j = 0; j < size; j++)
-                        data[j] = func(data[j]);
-                }
-                else
-                {
-                    CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported");
-                }
-            }
        }
    };
+public:
-    struct ReLUFunctor
+    ElementWiseLayer(LayerParams &_params) : func(_params) {}
+    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
    {
-        float negative_slope;
+        outputs.resize(inputs.size());
+        for (size_t i = 0; i < inputs.size(); i++)
+            outputs[i].shareFrom(*inputs[i]); //no data copy
+    }
-        ReLUFunctor(LayerParams &params)
+    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+    {
+        for (size_t i = 0; i < inputs.size(); i++)
        {
-            if (params.has("negative_slope"))
+            CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
-                negative_slope = params.get<float>("negative_slope");
+            CV_Assert(inputs[i]->matRefConst().isContinuous());
+            Range sizeRange = Range(0, outputs[i].total());
+            if (outputs[i].type() == CV_32F)
+            {
+                cv::parallel_for_(sizeRange, PBody<float>(outputs[i], func));
+            }
+            else if (outputs[i].type() == CV_64F)
+            {
+                cv::parallel_for_(sizeRange, PBody<double>(outputs[i], func));
+            }
            else
-                negative_slope = 0.f;
+            {
+                CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported");
+            }
        }
+    }
+};
-        template<typename TFloat>
-        inline TFloat operator()(TFloat x)
-        {
-            return (x >= (TFloat)0) ? x : negative_slope * x;
-        }
-    };
-    struct TanHFunctor
+struct ReLUFunctor
+{
+    float negative_slope;
+    ReLUFunctor(LayerParams &params)
    {
-        TanHFunctor(LayerParams&) {}
+        if (params.has("negative_slope"))
+            negative_slope = params.get<float>("negative_slope");
+        else
+            negative_slope = 0.f;
+    }
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        return (x >= (TFloat)0) ? x : negative_slope * x;
+    }
+};
-        template<typename TFloat>
+struct TanHFunctor
-        inline TFloat operator()(TFloat x)
+{
-        {
+    TanHFunctor(LayerParams&) {}
-            return tanh(x);
-        }
-    };
-    struct SigmoidFunctor
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
    {
-        SigmoidFunctor(LayerParams&) {}
+        return tanh(x);
+    }
+};
-        template<typename TFloat>
+struct SigmoidFunctor
-        inline TFloat operator()(TFloat x)
+{
-        {
+    SigmoidFunctor(LayerParams&) {}
-            return (TFloat)1 / ((TFloat)1 + exp(-x));
-        }
-    };
-    struct AbsValFunctor
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
    {
-        AbsValFunctor(LayerParams&) {}
+        return (TFloat)1 / ((TFloat)1 + exp(-x));
+    }
+};
-        template<typename TFloat>
+struct AbsValFunctor
-        inline TFloat operator()(TFloat x)
+{
-        {
+    AbsValFunctor(LayerParams&) {}
-            return abs(x);
-        }
-    };
-    struct PowerFunctor
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
    {
-        float power, scale, shift;
+        return abs(x);
+    }
+};
-        PowerFunctor(LayerParams &params)
+struct PowerFunctor
-        {
+{
-            power = params.get<float>("power", 1.0f);
+    float power, scale, shift;
-            scale = params.get<float>("scale", 1.0f);
-            shift = params.get<float>("shift", 0.0f);
-        }
-        template<typename TFloat>
+    PowerFunctor(LayerParams &params)
-        inline TFloat operator()(TFloat x)
+    {
-        {
+        power = params.get<float>("power", 1.0f);
-            return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+        scale = params.get<float>("scale", 1.0f);
-        }
+        shift = params.get<float>("shift", 0.0f);
-    };
+    }
-    struct BNLLFunctor
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
    {
-        BNLLFunctor(LayerParams&) {}
+        return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+    }
+};
+// BNLL (binomial normal log likelihood) activation: y = log(1 + exp(x)).
+struct BNLLFunctor
+{
+    // The layer has no parameters; the argument exists only so that the
+    // functor can be constructed the same way as the other functors.
+    BNLLFunctor(LayerParams&) {}
+    // Returns log(1 + exp(x)) evaluated as max(x, 0) + log(1 + exp(-|x|)).
+    // The two forms are algebraically equal, but the second one never calls
+    // exp() with a positive argument, so it cannot overflow for large x.
+    // (Without the max(x, 0) term the result is wrong for every x > 0.)
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        return (x > (TFloat)0 ? x : (TFloat)0) + log((TFloat)1 + exp(-abs(x)));
+    }
+};
-        template<typename TFloat>
-        inline TFloat operator()(TFloat x)
-        {
-            return log((TFloat)1 + exp(-abs(x)));
-        }
-    };
 }
 }
 #endif
--- a/modules/dnn/src/layers/op_im2col.hpp
+++ b/modules/dnn/src/layers/op_im2col.hpp
@@ -41,6 +41,8 @@
 #ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
 #define __OPENCV_DNN_LAYERS_IM2COL_HPP__
+#include <opencv2/core.hpp>
+#include <iostream>
 namespace cv
 {
@@ -48,33 +50,67 @@ namespace dnn
 {
 template <typename Dtype>
-void im2col_cpu(const Dtype* data_im,
+class im2col_CpuPBody : public cv::ParallelLoopBody
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                Dtype* data_col)
 {
-    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    const Dtype* data_im;
-    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int channels, height, width;
-    int channels_col = channels * kernel_h * kernel_w;
+    int kernel_h, kernel_w;
-    for (int c = 0; c < channels_col; ++c) {
+    int pad_h, pad_w;
-        int w_offset = c % kernel_w;
+    int stride_h, stride_w;
-        int h_offset = (c / kernel_w) % kernel_h;
+    Dtype* data_col;
-        int c_im = c / kernel_h / kernel_w;
+    int height_col, width_col, channels_col;
-        for (int h = 0; h < height_col; ++h) {
-            for (int w = 0; w < width_col; ++w) {
+public:
-                int h_pad = h * stride_h - pad_h + h_offset;
-                int w_pad = w * stride_w - pad_w + w_offset;
+    im2col_CpuPBody(const Dtype* data_im_,
-                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                     int channels_, int height_, int width_,
-                    data_col[(c * height_col + h) * width_col + w] =
+                     int kernel_h_, int kernel_w_,
-                    data_im[(c_im * height + h_pad) * width + w_pad];
+                     int pad_h_, int pad_w_,
-                else
+                     int stride_h_, int stride_w_,
-                    data_col[(c * height_col + h) * width_col + w] = 0;
+                     Dtype* data_col_) :
+        data_im(data_im_),
+        channels(channels_), height(height_), width(width_),
+        kernel_h(kernel_h_), kernel_w(kernel_w_),
+        pad_h(pad_h_), pad_w(pad_w_),
+        stride_h(stride_h_), stride_w(stride_w_),
+        data_col(data_col_)
+    {
+        height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+        channels_col = channels * kernel_h * kernel_w;
+    }
+    static void run(const Dtype* data_im, // Convenience entry point: builds a loop body from the arguments and executes it with cv::parallel_for_.
+                    int channels, int height, int width, // dimensions used to index data_im
+                    int kernel_h, int kernel_w, // kernel window size
+                    int pad_h, int pad_w, // padding subtracted from the source coordinates
+                    int stride_h, int stride_w, // step between consecutive output positions
+                    Dtype* data_col) // destination buffer, indexed as [channels_col][height_col][width_col]
+    {
+        im2col_CpuPBody<Dtype> pb(data_im, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_col); // the constructor precomputes height_col, width_col and channels_col
+        cv::parallel_for_(Range(0, pb.channels_col), pb); // the parallel range covers every row index c of data_col
+    }
+    virtual void operator ()(const Range &r) const // Fills the rows c in [r.start, r.end) of data_col from data_im.
+    {
+        for (int c = r.start; c < r.end; ++c) {
+            int w_offset = c % kernel_w; // horizontal position inside the kernel window
+            int h_offset = (c / kernel_w) % kernel_h; // vertical position inside the kernel window
+            int c_im = c / kernel_h / kernel_w; // source channel in data_im
+            for (int h = 0; h < height_col; ++h) {
+                for (int w = 0; w < width_col; ++w) {
+                    int h_pad = h * stride_h - pad_h + h_offset; // source row; negative or >= height when it falls into the padding
+                    int w_pad = w * stride_w - pad_w + w_offset; // source column; negative or >= width when it falls into the padding
+                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                        data_col[(c * height_col + h) * width_col + w] = // inside the image: copy the source element
+                        data_im[(c_im * height + h_pad) * width + w_pad];
+                    else
+                        data_col[(c * height_col + h) * width_col + w] = 0; // outside the image: write zero
+                }
            }
        }
    }
-}
+};
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col,

--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -65,6 +65,8 @@ static void testLayer(String basename, bool useCaffeModel = false, bool useCommo
    String inpfile = (useCommonInputBlob) ? _tf("blob.npy") : _tf(basename + ".input.npy");
    String outfile = _tf(basename + ".npy");
+    cv::setNumThreads(cv::getNumberOfCPUs());
    Net net;
    {
        Ptr<Importer> importer = createCaffeImporter(prototxt, (useCaffeModel) ? caffemodel : String());