Commit 4cb108ef authored by Vitaliy Lyudvichenko

Adding CPU parallelization for element-wise layers and im2col operation

parent 6d3cb808
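The change applies one pattern throughout: a loop body is wrapped in a subclass of cv::ParallelLoopBody and dispatched with cv::parallel_for_, which splits the iteration Range across OpenCV's worker threads. A minimal sketch of that pattern follows (the DoubleBody class and its buffer are illustrative only, not part of this commit):

    #include <opencv2/core.hpp>

    // Illustrative only: doubles every element of a raw float buffer in
    // parallel, the same shape as PBody and im2col_CpuPBody in the diff.
    class DoubleBody : public cv::ParallelLoopBody
    {
        float *data;
    public:
        DoubleBody(float *data_) : data(data_) {}

        // Called concurrently on disjoint sub-ranges of the full Range.
        void operator()(const cv::Range &r) const
        {
            for (int i = r.start; i < r.end; i++)
                data[i] *= 2.0f;
        }
    };

    // usage: cv::parallel_for_(cv::Range(0, n), DoubleBody(ptr));

When OpenCV is built without a threading backend, parallel_for_ degrades to a plain serial loop, so the same code path remains correct on single-threaded builds.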
@@ -179,9 +179,9 @@ namespace dnn
 #endif // HAVE_OPENCL
         if (inpBlob.type() == CV_32F)
-            im2col_cpu((float *)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
+            im2col_CpuPBody<float>::run((float*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
         if (inpBlob.type() == CV_64F)
-            im2col_cpu((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
+            im2col_CpuPBody<double>::run((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
     }

     void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
@@ -253,9 +253,9 @@ namespace dnn
         if (is1x1()) return;

         if (dstMat.type() == CV_32F)
-            col2im_cpu((float*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float*)dstMat.ptr());
+            col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
         if (dstMat.type() == CV_64F)
-            col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr());
+            col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
     }

 }
 }
@@ -55,12 +55,31 @@ using std::exp;
 using std::tanh;
 using std::pow;

 template<typename Func>
 class ElementWiseLayer : public Layer
 {
     Func func;

+    template<typename Dtype>
+    class PBody : public cv::ParallelLoopBody
+    {
+        Dtype *data;
+        Func &func;
+
     public:
+
+        PBody(Blob &blob, Func &func_) :
+            func(func_), data(blob.ptr<Dtype>())
+        {}
+
+        void operator()(const Range &r) const
+        {
+            for (int i = r.start; i < r.end; i++)
+                data[i] = func(data[i]);
+        }
+    };
+
+public:

     ElementWiseLayer(LayerParams &_params) : func(_params) {}

     void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
@@ -75,20 +94,17 @@ using std::pow;
         for (size_t i = 0; i < inputs.size(); i++)
         {
             CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
+            CV_Assert(inputs[i]->matRefConst().isContinuous());

-            size_t size = outputs[i].total();
+            Range sizeRange = Range(0, outputs[i].total());

             if (outputs[i].type() == CV_32F)
             {
-                float *data = outputs[i].ptrf();
-                for (size_t j = 0; j < size; j++)
-                    data[j] = func(data[j]);
+                cv::parallel_for_(sizeRange, PBody<float>(outputs[i], func));
             }
             else if (outputs[i].type() == CV_64F)
             {
-                double *data = outputs[i].ptr<double>();
-                for (size_t j = 0; j < size; j++)
-                    data[j] = func(data[j]);
+                cv::parallel_for_(sizeRange, PBody<double>(outputs[i], func));
             }
             else
             {
@@ -96,11 +112,11 @@ using std::pow;
             }
         }
     }
 };
 struct ReLUFunctor
 {
     float negative_slope;

     ReLUFunctor(LayerParams &params)
@@ -112,47 +128,47 @@ using std::pow;
     }

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return (x >= (TFloat)0) ? x : negative_slope * x;
     }
 };

 struct TanHFunctor
 {
     TanHFunctor(LayerParams&) {}

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return tanh(x);
     }
 };

 struct SigmoidFunctor
 {
     SigmoidFunctor(LayerParams&) {}

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return (TFloat)1 / ((TFloat)1 + exp(-x));
     }
 };

 struct AbsValFunctor
 {
     AbsValFunctor(LayerParams&) {}

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return abs(x);
     }
 };

 struct PowerFunctor
 {
     float power, scale, shift;

     PowerFunctor(LayerParams &params)
@@ -163,22 +179,23 @@ using std::pow;
     }

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
     }
 };

 struct BNLLFunctor
 {
     BNLLFunctor(LayerParams&) {}

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return log((TFloat)1 + exp(-abs(x)));
     }
 };
 }
 }

 #endif
@@ -41,6 +41,8 @@
 #ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
 #define __OPENCV_DNN_LAYERS_IM2COL_HPP__

+#include <opencv2/core.hpp>
+#include <iostream>

 namespace cv
 {
@@ -48,17 +50,50 @@ namespace dnn
 {

 template <typename Dtype>
-void im2col_cpu(const Dtype* data_im,
+class im2col_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+public:
+
+    im2col_CpuPBody(const Dtype* data_im_,
+                    int channels_, int height_, int width_,
+                    int kernel_h_, int kernel_w_,
+                    int pad_h_, int pad_w_,
+                    int stride_h_, int stride_w_,
+                    Dtype* data_col_) :
+        data_im(data_im_),
+        channels(channels_), height(height_), width(width_),
+        kernel_h(kernel_h_), kernel_w(kernel_w_),
+        pad_h(pad_h_), pad_w(pad_w_),
+        stride_h(stride_h_), stride_w(stride_w_),
+        data_col(data_col_)
+    {
+        height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+        channels_col = channels * kernel_h * kernel_w;
+    }
+
+    static void run(const Dtype* data_im,
                 int channels, int height, int width,
                 int kernel_h, int kernel_w,
                 int pad_h, int pad_w,
                 int stride_h, int stride_w,
                 Dtype* data_col)
     {
-    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-    for (int c = 0; c < channels_col; ++c) {
+        im2col_CpuPBody<Dtype> pb(data_im, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_col);
+        cv::parallel_for_(Range(0, pb.channels_col), pb);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        for (int c = r.start; c < r.end; ++c) {
         int w_offset = c % kernel_w;
         int h_offset = (c / kernel_w) % kernel_h;
         int c_im = c / kernel_h / kernel_w;
@@ -74,7 +109,8 @@ void im2col_cpu(const Dtype* data_im,
             }
         }
     }
 }
+};

 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col,
...
@@ -65,6 +65,8 @@ static void testLayer(String basename, bool useCaffeModel = false, bool useCommo
     String inpfile = (useCommonInputBlob) ? _tf("blob.npy") : _tf(basename + ".input.npy");
     String outfile = _tf(basename + ".npy");

+    cv::setNumThreads(cv::getNumberOfCPUs());
+
     Net net;
     {
         Ptr<Importer> importer = createCaffeImporter(prototxt, (useCaffeModel) ? caffemodel : String());
...