opencv_contrib / Commits / b97931e0

Commit b97931e0, authored Apr 26, 2017 by Vadim Pisarevsky
Merge pull request #1136 from vpisarev:dnn5

Parents: 3908909d 75789089
Showing 7 changed files with 488 additions and 445 deletions (+488 -445)
modules/dnn/samples/torch_enet.cpp             +9    -4
modules/dnn/src/layers/batch_norm_layer.cpp    +11   -11
modules/dnn/src/layers/convolution_layer.cpp   +86   -145
modules/dnn/src/layers/elementwise_layers.cpp  +46   -16
modules/dnn/src/layers/eltwise_layer.cpp       +4    -11
modules/dnn/src/layers/op_im2col.cpp           +323  -0
modules/dnn/src/layers/op_im2col.hpp           +9    -258
modules/dnn/samples/torch_enet.cpp

@@ -98,14 +98,19 @@ int main(int argc, char **argv)
     net.setBlob("", inputBlob);        //set the network input
     //! [Set input blob]
 
+    const int N = 3;
     TickMeter tm;
-    tm.start();
 
     //! [Make forward pass]
-    net.forward();                          //compute output
-    //! [Make forward pass]
-    tm.stop();
+    for( int i = 0; i < N; i++ )
+    {
+        TickMeter tm_;
+        tm_.start();
+        net.forward();                          //compute output
+        //! [Make forward pass]
+        tm_.stop();
+        if( i == 0 || tm_.getTimeTicks() < tm.getTimeTicks() )
+            tm = tm_;
+    }
 
     //! [Gather output]
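The timing change above reports the best of N = 3 forward passes instead of a single run, so one-off costs (first-pass allocations, cold caches) do not inflate the measurement. A minimal sketch of the same best-of-N pattern, factored into a helper; `bestOfN` and the `work` callable are illustrative stand-ins (for `net.forward()` in the sample), not code from the commit:

    #include <opencv2/core/utility.hpp> // cv::TickMeter (a local TickMeter works the same)

    // Run `work` N times and keep the fastest run, as in the sample's loop.
    template <typename F>
    cv::TickMeter bestOfN(F work, int N = 3)
    {
        cv::TickMeter best;
        for (int i = 0; i < N; i++)
        {
            cv::TickMeter tm;
            tm.start();
            work();      // the measured call, e.g. net.forward()
            tm.stop();
            if (i == 0 || tm.getTimeTicks() < best.getTimeTicks())
                best = tm;
        }
        return best;
    }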
modules/dnn/src/layers/batch_norm_layer.cpp

@@ -41,6 +41,15 @@ public:
             Mat* inp = inputs[i];
             outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
         }
+
+        varMeanScale = 1.f;
+        if (!hasWeights && !hasBias) {
+            varMeanScale = *blobs[2].ptr<float>();
+            if (varMeanScale != 0)
+                varMeanScale = 1/varMeanScale;
+        }
+
+        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
     }
 
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
@@ -52,16 +61,6 @@ public:
         int weightsBlobIndex = 2;
         int biasBlobIndex = weightsBlobIndex + hasWeights;
 
-        float varMeanScale = 1;
-        if (!hasWeights && !hasBias) {
-            varMeanScale = *blobs[2].ptr<float>();
-            if (varMeanScale != 0)
-                varMeanScale = 1/varMeanScale;
-        }
-
-        Mat invStdMat;
-        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
-
         int rows = inpBlob.size[2];
         int cols = inpBlob.size[3];
@@ -92,7 +91,8 @@ public:
     }
 
     bool hasWeights, hasBias;
-    float epsilon;
+    float epsilon, varMeanScale;
+    Mat invStdMat;
 };
 
 Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams &params)
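The batch-norm change is a caching move: the inverse standard deviation invStdMat = pow(var * varMeanScale + epsilon, -0.5) and the varMeanScale rescaling of Caffe-style accumulated statistics (blobs[2]) are now computed once in allocate(), i.e. per shape change, instead of on every forward() call, with both promoted to class members. A sketch of the per-element transform those cached values feed, assuming the standard batch-norm formula this layer implements; `normalize` is illustrative, not a function in the file:

    // y = (x - mean_c) * invStd_c * weight_c + bias_c, per channel c;
    // invStd_c = pow(var_c * varMeanScale + epsilon, -0.5f) is precomputed,
    // weight_c/bias_c default to 1/0 when !hasWeights / !hasBias.
    static inline float normalize(float x, float mean_c, float invStd_c,
                                  float weight_c, float bias_c)
    {
        return (x - mean_c) * invStd_c * weight_c + bias_c;
    }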
modules/dnn/src/layers/convolution_layer.cpp

@@ -54,70 +54,25 @@ namespace dnn
 class BaseConvolutionLayerImpl : public ConvolutionLayer
 {
 public:
-    BaseConvolutionLayerImpl();
-    virtual void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
-
-    void init();
-    virtual void computeInpOutShape(const Mat &inpBlob) = 0;
-    bool is1x1() const;
-
-    int numOutput, group;
-    int inpH, inpW, inpCn;
-    int outH, outW, outCn;
-    int inpGroupCn, outGroupCn;
-    int ksize;
-    std::vector<int> colRowBlobShape;
-
-    bool bias;
-    Mat colRowBlob, biasOnesBlob;
-};
-
-//TODO: simultaneously convolution and bias addition for cache optimization
-class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
-{
-public:
-    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
-    virtual void computeInpOutShape(const Mat &inpBlob);
-
-    void im2col(const Mat &srcImg, Mat &dstCol);
-    void im2row(const Mat &srcImg, Mat &dstRow);
-};
-
-class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
-{
-public:
-    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
-    virtual void computeInpOutShape(const Mat &inpBlob);
-
-    void col2im(const Mat &colMat, Mat &dstImg);
-};
-
-BaseConvolutionLayerImpl::BaseConvolutionLayerImpl():
-    numOutput(-1), group(-1),
-    inpH(0), inpW(0), inpCn(0),
-    outH(0), outW(0), outCn(0),
-    inpGroupCn(0), outGroupCn(0),
-    ksize(0), bias(false)
-{
+    BaseConvolutionLayerImpl()
+    {
+        numOutput = -1;
+        group = -1;
+        inpH = inpW = inpCn = 0;
+        outH = outW = outCn = 0;
+        inpGroupCn = outGroupCn = 0;
+        ksize = 0;
+        bias = false;
 #ifdef HAVE_LAPACK
-    if (getBlasThreads() != cv::getThreadNum())
-    {
-        setBlasThreads(cv::getThreadNum());
-    }
+        int nthreads = cv::getThreadNum();
+        if (getBlasThreads() != nthreads)
+        {
+            setBlasThreads(nthreads);
+        }
 #endif
-}
-
-void BaseConvolutionLayerImpl::init()
-{
-    CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
-    CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
-    bias = (blobs.size() >= 2);
-}
-
-void BaseConvolutionLayerImpl::allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
-    CV_Assert(inputs.size() > 0);
-    init();
+    }
+
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size() > 0);
+        init();
@@ -151,17 +106,40 @@ void BaseConvolutionLayerImpl::allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
             colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
             colRowBlob.setTo(0);
         }
-}
-
-bool BaseConvolutionLayerImpl::is1x1() const
-{
-    return (kernel.height == 1 && kernel.width == 1) &&
-           (stride.height == 1 && stride.width == 1) &&
-           (dilation.height == 1 && dilation.width == 1);
-}
-
-void ConvolutionLayerImpl::computeInpOutShape(const Mat &input)
+    }
+
+    void init()
+    {
+        CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
+        CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width &&
+                  blobs[0].size[2] == kernel.height);
+        bias = (blobs.size() >= 2);
+    }
+
+    virtual void computeInpOutShape(const Mat &inpBlob) = 0;
+
+    bool is1x1() const
+    {
+        return (kernel.height == 1 && kernel.width == 1) &&
+               (stride.height == 1 && stride.width == 1) &&
+               (dilation.height == 1 && dilation.width == 1);
+    }
+
+    int numOutput, group;
+    int inpH, inpW, inpCn;
+    int outH, outW, outCn;
+    int inpGroupCn, outGroupCn;
+    int ksize;
+    std::vector<int> colRowBlobShape;
+
+    bool bias;
+    Mat colRowBlob, biasOnesBlob;
+};
+
+//TODO: simultaneously convolution and bias addition for cache optimization
+class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
+{
+public:
+    void computeInpOutShape(const Mat &input)
     {
         CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
         numOutput = blobs[0].size[0];
@@ -193,10 +171,10 @@ void ConvolutionLayerImpl::computeInpOutShape(const Mat &input)
         colRowBlobShape.clear();
         colRowBlobShape.push_back(outH*outW);
         colRowBlobShape.push_back(ksize);
     }
-}
+    }
 
-void ConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
         CV_Assert(inputs.size() > 0);
 
         Mat weightsMat = blobs[0].reshape(1, outCn);
@@ -212,9 +190,9 @@ void ConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
         {
             for (int g = 0; g < group; g++)
             {
-                Mat colMat, curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
+                Mat curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
 
-                im2row(curInp, colMat);
+                im2row(curInp, colRowBlob);
 
                 _Range kerRange(g * outGroupCn, outGroupCn);
                 Mat kerMat = weightsMat.rowRange(kerRange);
@@ -222,7 +200,7 @@ void ConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
                 _Range outRange((g + n * group) * outGroupCn, outGroupCn);
                 Mat dstMat = outMat.rowRange(outRange);
 
-                dnn::gemm(kerMat, colMat, 1, dstMat, 0, GEMM_2_T);
+                dnn::gemm(kerMat, colRowBlob, 1, dstMat, 0, GEMM_2_T);
 
                 if (bias)
                 {
@@ -231,54 +209,28 @@ void ConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
                 }
             }
         }
-}
-
-void ConvolutionLayerImpl::im2col(const Mat &srcImg, Mat &dstCol)
-{
-    if (is1x1())
-    {
-        dstCol = srcImg.reshape(1, ksize);
-        return;
-    }
-
-    Mat &colMat = colRowBlob;
-    if (srcImg.type() == CV_32F)
-        im2col_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
-                                    kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                    dilation.height, dilation.width, outH, outW, colMat.ptr<float>());
-    if (srcImg.type() == CV_64F)
-        im2col_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height,
-                                     kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                     dilation.height, dilation.width, outH, outW, colMat.ptr<double>());
-
-    dstCol = colMat;
-}
-
-void ConvolutionLayerImpl::im2row(const Mat &srcImg, Mat &dstRow)
-{
-    if (is1x1())
-    {
-        dstRow = srcImg.reshape(1, ksize).t();
-        return;
-    }
-
-    Mat &colMat = colRowBlob;
-    if (srcImg.type() == CV_32F)
-        im2row_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
-                                    kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                    dilation.height, dilation.width, outH, outW, colMat.ptr<float>());
-    if (srcImg.type() == CV_64F)
-        im2row_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height,
-                                     kernel.width, pad.height, pad.width, stride.height, stride.width,
-                                     dilation.height, dilation.width, outH, outW, colMat.ptr<double>());
-
-    dstRow = colMat;
-}
-
-//Deconvolution
-
-void DeConvolutionLayerImpl::computeInpOutShape(const Mat &inpBlob)
+    }
+
+    void im2row(const Mat &srcImg, Mat &dstRow)
+    {
+        if (is1x1())
+        {
+            transpose(srcImg.reshape(1, ksize), dstRow);
+        }
+        else
+        {
+            cv::dnn::im2row(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
+                            kernel.width, pad.height, pad.width, stride.height, stride.width,
+                            dilation.height, dilation.width, outH, outW, dstRow.ptr<float>());
+        }
+    }
+};
+
+class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
+{
+public:
+    void computeInpOutShape(const Mat &inpBlob)
     {
         CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
         numOutput = blobs[0].size[0];
@@ -302,10 +254,21 @@ void DeConvolutionLayerImpl::computeInpOutShape(const Mat &inpBlob)
         colRowBlobShape.clear();
         colRowBlobShape.push_back(ksize);
         colRowBlobShape.push_back(inpH * inpW);
-    }
 
-void DeConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-{
+        ofsbuf.resize(ksize*3);
+        for( int k = 0; k < ksize; k++ )
+        {
+            int w_offset = k % kernel.width;
+            int h_offset = (k / kernel.width) % kernel.height;
+            int c_im = k / kernel.height / kernel.width;
+            ofsbuf[k*3] = w_offset;
+            ofsbuf[k*3+1] = h_offset;
+            ofsbuf[k*3+2] = c_im;
+        }
+    }
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
         Mat weightsMat = blobs[0].reshape(1, inpCn);
         Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
@@ -338,44 +301,22 @@ void DeConvolutionLayerImpl::forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
                 }
             }
         }
     }
-}
-
-void DeConvolutionLayerImpl::col2im(const Mat &colMat, Mat &dstImg)
-{
-    if (is1x1())
-    {
-        dstImg = colMat;
-        return;
-    }
-    if (dstImg.type() == CV_32F)
-        col2im_CpuPBody<float>::run(colMat.ptr<float>(), outGroupCn, outH, outW,
-                                    kernel.height, kernel.width, pad.height, pad.width,
-                                    stride.height, stride.width, dstImg.ptr<float>());
-    if (dstImg.type() == CV_64F)
-        col2im_CpuPBody<double>::run(colMat.ptr<double>(), inpGroupCn, inpH, inpW,
-                                     kernel.height, kernel.width, pad.height, pad.width,
-                                     stride.height, stride.width, dstImg.ptr<double>());
-}
-
-//Initializers
-
-/*Ptr<BaseConvolutionLayer> ConvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation)
-{
-    ConvolutionLayerImpl *l = new ConvolutionLayerImpl();
-    l->kernel = kernel;
-    l->pad = pad;
-    l->stride = stride;
-    l->dilation = dilation;
-    return Ptr<BaseConvolutionLayer>(l);
-}
-
-Ptr<BaseConvolutionLayer> DeconvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation, Size adjustPad)
-{
-    DeConvolutionLayerImpl *l = new DeConvolutionLayerImpl();
-    l->kernel = kernel;
-    l->pad = pad;
-    l->stride = stride;
-    l->dilation = dilation;
-    l->adjustPad = adjustPad;
-    return Ptr<BaseConvolutionLayer>(l);
-}*/
+    }
+
+    void col2im(const Mat &colMat, Mat &dstImg)
+    {
+        if (is1x1())
+        {
+            dstImg = colMat;
+            return;
+        }
+        cv::dnn::col2im(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width,
+                        pad.height, pad.width, stride.height, stride.width,
+                        dilation.height, dilation.width, dstImg.ptr<float>(), &ofsbuf[0]);
+    }
+
+    std::vector<int> ofsbuf;
+};
 
 //Convolution and Deconvolution
 
 static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, const LayerParams &params)
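The convolution rewrite above collapses the declaration/definition split into inline class bodies, drops the CV_64F and im2col code paths, and leaves a single float pipeline: im2row unrolls each output pixel's receptive field into one row of colRowBlob, and one dnn::gemm(kerMat, colRowBlob, 1, dstMat, 0, GEMM_2_T) per group computes the output. A naive reference im2row as a sketch of that layout (illustrative and simplified from the commit's version: no dilation, square padding, no fast path); row r of `rows` holds the C*kh*kw inputs that produce output pixel r, so conv is exactly Y (outCn x outH*outW) = W (outCn x C*kh*kw) * rows^T:

    #include <vector>

    static void im2rowRef(const float* img, int C, int H, int W,
                          int kh, int kw, int pad, int stride,
                          int outH, int outW, std::vector<float>& rows)
    {
        rows.assign((size_t)outH * outW * C * kh * kw, 0.f); // zeros double as padding
        for (int oy = 0; oy < outH; oy++)
            for (int ox = 0; ox < outW; ox++)
            {
                float* row = &rows[((size_t)oy * outW + ox) * C * kh * kw];
                for (int c = 0; c < C; c++)
                    for (int y = 0; y < kh; y++)
                        for (int x = 0; x < kw; x++)
                        {
                            int iy = oy * stride - pad + y;
                            int ix = ox * stride - pad + x;
                            if (iy >= 0 && iy < H && ix >= 0 && ix < W)
                                row[(c * kh + y) * kw + x] = img[(c * H + iy) * W + ix];
                        }
            }
    }

With rows laid out this way each output pixel's data is contiguous, which is why the 1x1/stride-1 case can skip the copy entirely and just transpose the reshaped input.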
modules/dnn/src/layers/elementwise_layers.cpp

@@ -15,8 +15,7 @@ using std::pow;
 template<typename Func>
 class ElementWiseLayer : public Func::Layer
 {
-    Func func;
+public:
     template<typename Dtype>
     class PBody : public cv::ParallelLoopBody
     {
@@ -35,9 +34,7 @@ class ElementWiseLayer : public Func::Layer
         }
     };
 
-public:
-    ElementWiseLayer(const Func &f=Func()) : func(f) {}
+    ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}
 
     void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
@@ -58,9 +55,16 @@ public:
             Range sizeRange = Range(0, dst.total());
             CV_Assert(src.type() == CV_32F);
-            cv::parallel_for_(sizeRange, PBody<float>(dst, func));
+            PBody<float> body(dst, func);
+            if( run_parallel )
+                cv::parallel_for_(sizeRange, body);
+            else
+                body(sizeRange);
         }
     }
 
+    Func func;
+    bool run_parallel;
 };
 
 struct ReLUFunctor
@@ -135,8 +139,24 @@ struct PowerFunctor
     template<typename TFloat>
     inline TFloat operator()(TFloat x) const
     {
-        return power == 1.0f ? (TFloat)shift + (TFloat)scale * x :
-               pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+        return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
     }
 };
+
+struct PowerFunctor1
+{
+    typedef PowerLayer Layer;
+
+    const float scale;
+    const float shift;
+
+    PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
+        : scale(scale_), shift(shift_) {}
+
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        return (TFloat)shift + (TFloat)scale * x;
+    }
+};
@@ -165,12 +185,12 @@ public:
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
         CV_Assert(inputs.size() == 1);
         Mat &inpBlob = *inputs[0];
 
         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
             Mat &outBlob = outputs[ii];
+            CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());
 
             CV_Assert(blobs[0].total() == inpBlob.size[1]);
@@ -181,8 +201,16 @@ public:
                 Mat inpBlobPlane = getPlane(inpBlob, 0, n);
                 Mat outBlobPlane = getPlane(outBlob, 0, n);
 
-                threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
-                scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
+                size_t i, planeTotal = inpBlobPlane.total();
+                const float* inptr = inpBlobPlane.ptr<float>();
+                float* outptr = outBlobPlane.ptr<float>();
+                for( i = 0; i < planeTotal; i++ )
+                {
+                    float val = inptr[i];
+                    outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
+                }
+                //threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
+                //scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
             }
         }
     }
@@ -196,7 +224,7 @@ Ptr<_Layer> _Layer::create() { \
 Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
 {
     float negativeSlope = params.get<float>("negative_slope", 0.f);
-    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
+    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(false, ReLUFunctor(negativeSlope)));
     l->setParamsFrom(params);
 
     return l;
@@ -204,7 +232,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
 Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
 {
-    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
+    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -212,7 +240,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
 Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
 {
-    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
+    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -228,7 +256,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
 Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
 {
-    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
+    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
     l->setParamsFrom(params);
 
     return l;
@@ -239,7 +267,9 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
     float power = params.get<float>("power", 1.0f);
     float scale = params.get<float>("scale", 1.0f);
     float shift = params.get<float>("shift", 0.0f);
-    Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
+    Ptr<PowerLayer> l(power == 1.f ?
+        (PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
+        (PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
     l->setParamsFrom(params);
 
     return l;
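The recurring (false, ...) / (true, ...) constructor arguments above are the new run_parallel flag: for cheap functors (ReLU, and the power == 1 case now served by the linear PowerFunctor1) the per-element work is too small to amortize cv::parallel_for_'s scheduling overhead, so the body runs serially; transcendental functors (tanh, sigmoid, BNLL, general pow) keep the parallel path. A sketch of the dispatch, with `runElementWise` as an illustrative stand-in for the logic inside forward():

    #include <opencv2/core.hpp>

    // Either hand the body to parallel_for_ or invoke it directly on the
    // whole range -- same semantics, different scheduling cost.
    static void runElementWise(const cv::ParallelLoopBody& body, int total,
                               bool runParallel)
    {
        cv::Range all(0, total);
        if (runParallel)
            cv::parallel_for_(all, body); // pays off for tanh/sigmoid/pow
        else
            body(all);                    // cheaper for ReLU and linear power
    }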
modules/dnn/src/layers/eltwise_layer.cpp

@@ -98,15 +98,14 @@ public:
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
+        Mat& output = outputs[0];
         switch (op)
         {
         case SUM:
             {
                 CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
-                Mat& output = outputs[0];
-                output.setTo(0.);
                 if (0 < coeffs.size())
                 {
+                    output.setTo(0.);
                     for (size_t i = 0; i < inputs.size(); i++)
                     {
                         output += *inputs[i] * coeffs[i];
@@ -114,32 +113,26 @@ public:
                 }
                 else
                 {
-                    for (size_t i = 0; i < inputs.size(); i++)
+                    add(*inputs[0], *inputs[1], output);
+                    for (size_t i = 2; i < inputs.size(); i++)
                     {
                         output += *inputs[i];
                     }
                 }
             }
             break;
         case PROD:
             {
-                Mat& output = outputs[0];
                 output.setTo(1.);
                 for (size_t i = 0; i < inputs.size(); i++)
                 {
                     output = output.mul(*inputs[i]);
                 }
             }
             break;
         case MAX:
             {
-                Mat& output = outputs[0];
                 cv::max(*inputs[0], *inputs[1], output);
                 for (size_t i = 2; i < inputs.size(); i++)
                 {
                     cv::max(output, *inputs[i], output);
                 }
             }
             break;
         default:
             CV_Assert(0);
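Two small savings in the eltwise rewrite: `output` is hoisted out of the switch, and the no-coefficient SUM (like MAX) is now seeded from the first two inputs instead of setTo(0) plus a fold from index 0, dropping one full pass over the output. The SUM case in isolation (a sketch; `eltwiseSum` is illustrative, not a function in the file):

    #include <opencv2/core.hpp>
    #include <vector>

    // add() writes the first partial sum directly, so no setTo(0) pass is needed.
    static void eltwiseSum(const std::vector<cv::Mat*>& inputs, cv::Mat& output)
    {
        CV_Assert(inputs.size() >= 2);
        cv::add(*inputs[0], *inputs[1], output);
        for (size_t i = 2; i < inputs.size(); i++)
            output += *inputs[i];
    }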
modules/dnn/src/layers/op_im2col.cpp

@@ -44,3 +44,326 @@
 #include "op_im2col.hpp"
 #include "opencl_kernels_dnn.hpp"
+
+namespace cv
+{
+namespace dnn
+{
+
+#if 0
+template <typename Dtype>
+class im2col_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    int dilation_h, dilation_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+    im2col_CpuPBody() {}
+public:
+    static void run(const Dtype* data_im,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    int dilation_h, int dilation_w,
+                    int height_col, int width_col,
+                    Dtype* data_col)
+    {
+        im2col_CpuPBody<Dtype> t;
+        t.data_im = data_im;
+        t.data_col = data_col;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
+        t.height_col = height_col;
+        t.width_col = width_col;
+        t.channels_col = channels * kernel_h * kernel_w;
+
+        cv::parallel_for_(Range(0, t.channels_col), t);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        for (int c = r.start; c < r.end; ++c)
+        {
+            int w_offset = c % kernel_w;
+            int h_offset = (c / kernel_w) % kernel_h;
+            int c_im = c / kernel_h / kernel_w;
+            for (int h = 0; h < height_col; ++h)
+            {
+                for (int w = 0; w < width_col; ++w)
+                {
+                    int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
+                    int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
+                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                        data_col[(c * height_col + h) * width_col + w] =
+                            data_im[(c_im * height + h_pad) * width + w_pad];
+                    else
+                        data_col[(c * height_col + h) * width_col + w] = 0;
+                }
+            }
+        }
+    }
+};
+#endif
+
+template <typename Dtype>
+class im2row_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    int dilation_h, dilation_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+    im2row_CpuPBody() {}
+public:
+    static void run(const Dtype* data_im,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    int dilation_h, int dilation_w,
+                    int height_col, int width_col,
+                    Dtype* data_col)
+    {
+        im2row_CpuPBody<Dtype> t;
+        t.data_im = data_im;
+        t.data_col = data_col;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
+        t.height_col = height_col;
+        t.width_col = width_col;
+        t.channels_col = channels * kernel_h * kernel_w;
+
+        int total = t.height_col*t.width_col;
+#if 1
+        t(Range(0, total));
+#else
+        cv::parallel_for_(Range(0, total), t, 16);
+#endif
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        int dh = dilation_h, dw = dilation_w;
+        int kh = kernel_h, kw = kernel_w;
+        Dtype* data_col_ = data_col;
+        const Dtype* data_im_ = data_im;
+        int kelems = kh*kw;
+        AutoBuffer<int> ofs_(kelems);
+        int* ofs = ofs_;
+        int k = 0;
+
+        for( int k_r = 0; k_r < kernel_h; k_r++ )
+            for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
+                ofs[k] = k_r*dh*width + k_c*dw;
+
+        for (int row = r.start; row < r.end; ++row)
+        {
+            int out_c = row % width_col;
+            int out_r = row / width_col;
+            int out_row_offset = row*kh*kw*channels;
+
+            int start_in_r = out_r * stride_h - pad_h;
+            int start_in_c = out_c * stride_w - pad_w;
+            int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
+            int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
+            int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
+            int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
+
+            if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
+            {
+                for( int i_c = 0; i_c < channels; i_c++ )
+                {
+                    float* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
+                    const float* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
+
+                    for( k = 0; k < kelems; k++ )
+                    {
+                        data_col_c[k] = data_im_c[ofs[k]];
+                    }
+                }
+            }
+            else
+            {
+                memset(data_col_, 0, kw*kh*channels*sizeof(data_col_[0]));
+                for (int i_c = 0; i_c < channels; i_c++)
+                {
+                    int channels_offset = i_c * width * height;
+                    int out_ch_offset = i_c*kh*kw;
+                    int in_r = start_in_r + start_k_r*dh;
+
+                    for (int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
+                    {
+                        int row_offset = in_r*width;
+                        int out_col_offset = k_r*kw;
+                        int in_c = start_in_c + start_k_c*dw;
+
+                        for (int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
+                        {
+                            int in_index = channels_offset + row_offset + in_c;
+                            int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
+                            data_col_[out_index] = data_im_[in_index];
+                        }
+                    }
+                }
+            }
+        }
+    }
+};
+
+void im2row(const float* data_im, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            int height_col, int width_col, float* data_col)
+{
+    im2row_CpuPBody<float>::run(data_im, channels, height, width,
+                                kernel_h, kernel_w, pad_h, pad_w,
+                                stride_h, stride_w, dilation_h, dilation_w,
+                                height_col, width_col, data_col);
+}
+
+#if 0
+template <typename Dtype>
+class col2im_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_col;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    Dtype* data_im;
+    int height_col, width_col;
+
+    col2im_CpuPBody() {}
+public:
+    static void run(const Dtype* data_col,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    Dtype* data_im)
+    {
+        //TODO: single-threaded version switch
+        col2im_CpuPBody t;
+        t.data_col = data_col;
+        t.data_im = data_im;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+
+        int img_total = channels * height * width;
+        cv::parallel_for_(Range(0, img_total), t);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        const Dtype* data_col_ = data_col;
+        Dtype* data_im_ = data_im;
+        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
+        int coeff_w_col = (1 - stride_w * height_col * width_col);
+        for (int index = r.start; index < r.end; index++)
+        {
+            Dtype val = 0;
+            int w = index % width + pad_w;
+            int h = (index / width) % height + pad_h;
+            int c = index / (width * height);
+
+            // compute the start and end of the output
+            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            int w_col_end = std::min(w / stride_w + 1, width_col);
+            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            int h_col_end = std::min(h / stride_h + 1, height_col);
+
+            // equivalent implementation
+            int offset =
+                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
+            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+                }
+            }
+            data_im_[index] = val;
+        }
+    }
+};
+#endif
+
+//single-threaded version
+template <typename Dtype>
+void col2im_cpu(const Dtype* data_col,
+                int channels, int height, int width,
+                int kernel_h, int kernel_w,
+                int pad_h, int pad_w,
+                int stride_h, int stride_w,
+                int dilation_h, int dilation_w,
+                Dtype* data_im,
+                const int* ofsbuf)
+{
+    int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+
+    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
+
+    for (int c = 0; c < channels_col; ++c, ofsbuf += 3)
+    {
+        //int w_offset = c % kernel_w;
+        //int h_offset = (c / kernel_w) % kernel_h;
+        //int c_im = c / kernel_h / kernel_w;
+        int w_offset = ofsbuf[0];
+        int h_offset = ofsbuf[1];
+        int c_im = ofsbuf[2];
+
+        for (int h = 0; h < height_col; ++h)
+        {
+            for (int w = 0; w < width_col; ++w)
+            {
+                int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
+                int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
+
+                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                    data_im[(c_im * height + h_pad) * width + w_pad] +=
+                        data_col[(c * height_col + h) * width_col + w];
+            }
+        }
+    }
+}
+
+void col2im(const float* data_col, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            float* data_im, const int* ofsbuf)
+{
+    //col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
+    col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w,
+               pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
+}
+
+}
+}
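The new im2row_CpuPBody splits each output row into two cases: when the kernel window lies entirely inside the image (start_k_r == 0 && end_k_r == kh && ...), every tap is a straight indexed load through a precomputed offset table, ofs[k] = k_r*dilation_h*width + k_c*dilation_w; only border windows take the clipped loop. The offset-table idea in isolation (illustrative helpers, not functions from the file):

    #include <vector>

    // Flat offsets of the kh*kw kernel taps relative to a window's top-left
    // pixel, with dilation folded in -- computed once, reused per window.
    static std::vector<int> makeTapOffsets(int kh, int kw, int dh, int dw, int width)
    {
        std::vector<int> ofs((size_t)kh * kw);
        for (int r = 0; r < kh; r++)
            for (int c = 0; c < kw; c++)
                ofs[(size_t)r * kw + c] = r * dh * width + c * dw;
        return ofs;
    }

    // Interior fast path: gather one channel's window with no bounds checks.
    static void gatherWindow(const float* imgTopLeft, const std::vector<int>& ofs,
                             float* dst)
    {
        for (size_t k = 0; k < ofs.size(); k++)
            dst[k] = imgTopLeft[ofs[k]];
    }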
modules/dnn/src/layers/op_im2col.hpp

@@ -49,264 +49,15 @@ namespace cv
 namespace dnn
 {
 
-template <typename Dtype>
-class im2col_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_im;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    int dilation_h, dilation_w;
-    Dtype* data_col;
-    int height_col, width_col, channels_col;
-
-    im2col_CpuPBody() {}
-public:
-    static void run(const Dtype* data_im,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    int dilation_h, int dilation_w,
-                    int height_col, int width_col,
-                    Dtype* data_col)
-    {
-        im2col_CpuPBody<Dtype> t;
-        t.data_im = data_im;
-        t.data_col = data_col;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
-        t.height_col = height_col;
-        t.width_col = width_col;
-        t.channels_col = channels * kernel_h * kernel_w;
-        cv::parallel_for_(Range(0, t.channels_col), t);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        for (int c = r.start; c < r.end; ++c)
-        {
-            int w_offset = c % kernel_w;
-            int h_offset = (c / kernel_w) % kernel_h;
-            int c_im = c / kernel_h / kernel_w;
-            for (int h = 0; h < height_col; ++h)
-            {
-                for (int w = 0; w < width_col; ++w)
-                {
-                    int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
-                    int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
-                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                        data_col[(c * height_col + h) * width_col + w] =
-                            data_im[(c_im * height + h_pad) * width + w_pad];
-                    else
-                        data_col[(c * height_col + h) * width_col + w] = 0;
-                }
-            }
-        }
-    }
-};
-
-template <typename Dtype>
-class im2row_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_im;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    int dilation_h, dilation_w;
-    Dtype* data_col;
-    int height_col, width_col, channels_col;
-
-    im2row_CpuPBody() {}
-public:
-    static void run(const Dtype* data_im,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    int dilation_h, int dilation_w,
-                    int height_col, int width_col,
-                    Dtype* data_col)
-    {
-        im2row_CpuPBody<Dtype> t;
-        t.data_im = data_im;
-        t.data_col = data_col;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
-        t.height_col = height_col;
-        t.width_col = width_col;
-        t.channels_col = channels * kernel_h * kernel_w;
-
-        cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        int dh = dilation_h, dw = dilation_w;
-        Dtype* data_col_ = data_col;
-        const Dtype* data_im_ = data_im;
-
-        for (int row = r.start; row < r.end; ++row)
-        {
-            int out_c = row % width_col;
-            int out_r = row / width_col;
-            int out_row_offset = row*kernel_h*kernel_w*channels;
-
-            int start_in_r = out_r * stride_h - pad_h;
-            int start_in_c = out_c * stride_w - pad_w;
-            int start_k_r = std::max(0, cvCeil(-start_in_r / (float)dilation_h));
-            int end_k_r = std::min(kernel_h, cvCeil((height - start_in_r) / (float)dilation_h));
-            int start_k_c = std::max(0, cvCeil(-start_in_c / (float)dilation_w));
-            int end_k_c = std::min(kernel_w, cvCeil((width - start_in_c) / (float)dilation_w));
-
-            for (int i_c = 0; i_c < channels; i_c++)
-            {
-                int channels_offset = i_c * width * height;
-                int out_ch_offset = i_c*kernel_h*kernel_w;
-                int in_r = start_in_r + start_k_r*dilation_h;
-
-                for (int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
-                {
-                    int row_offset = in_r*width;
-                    int out_col_offset = k_r*kernel_w;
-                    int in_c = start_in_c + start_k_c*dilation_w;
-
-                    for (int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
-                    {
-                        int in_index = channels_offset + row_offset + in_c;
-                        int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
-                        data_col_[out_index] = data_im_[in_index];
-                    }
-                }
-            }
-        }
-    }
-};
-
-template <typename Dtype>
-class col2im_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_col;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    Dtype* data_im;
-    int height_col, width_col;
-
-    col2im_CpuPBody() {}
-public:
-    static void run(const Dtype* data_col,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    Dtype* data_im)
-    {
-        //TODO: single-threaded version switch
-        col2im_CpuPBody t;
-        t.data_col = data_col;
-        t.data_im = data_im;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-
-        int img_total = channels * height * width;
-        cv::parallel_for_(Range(0, img_total), t);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        const Dtype* data_col_ = data_col;
-        Dtype* data_im_ = data_im;
-        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
-        int coeff_w_col = (1 - stride_w * height_col * width_col);
-        for (int index = r.start; index < r.end; index++)
-        {
-            Dtype val = 0;
-            int w = index % width + pad_w;
-            int h = (index / width) % height + pad_h;
-            int c = index / (width * height);
-
-            // compute the start and end of the output
-            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-            int w_col_end = std::min(w / stride_w + 1, width_col);
-            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-            int h_col_end = std::min(h / stride_h + 1, height_col);
-
-            // equivalent implementation
-            int offset =
-                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
-            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-                }
-            }
-            data_im_[index] = val;
-        }
-    }
-};
-
-//single-threaded version
-template <typename Dtype>
-void col2im_cpu(const Dtype* data_col,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                int dilation_h, int dilation_w,
-                Dtype* data_im)
-{
-    int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-
-    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
-
-    for (int c = 0; c < channels_col; ++c)
-    {
-        int w_offset = c % kernel_w;
-        int h_offset = (c / kernel_w) % kernel_h;
-        int c_im = c / kernel_h / kernel_w;
-
-        for (int h = 0; h < height_col; ++h)
-        {
-            for (int w = 0; w < width_col; ++w)
-            {
-                int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
-                int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
-
-                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                    data_im[(c_im * height + h_pad) * width + w_pad] +=
-                        data_col[(c * height_col + h) * width_col + w];
-            }
-        }
-    }
-}
+void im2row(const float* data_im, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            int height_col, int width_col, float* data_col);
+
+void col2im(const float* data_col, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            float* data_im, const int* ofsbuf);
 
 }
 }
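Throughout both op_im2col files the output geometry uses the dilation-aware size formula: a kernel of extent k dilated by d covers d*(k - 1) + 1 input pixels, so out = (in + 2*pad - (d*(k - 1) + 1)) / stride + 1. A self-contained helper with a worked example (the function name is illustrative):

    // e.g. in=7, k=3, pad=1, stride=1, dilation=2:
    // effective kernel = 2*(3-1)+1 = 5, out = (7 + 2 - 5)/1 + 1 = 5
    static inline int convOutputSize(int in, int k, int pad, int stride, int dilation)
    {
        return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
    }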