Commit 645260af authored by Vadim Pisarevsky, committed by GitHub

optimized several conv net layers (#1227)

* rewrote the following layers to be [much] more efficient: convolution, fully connected, activations (ReLU, tanh, ...), LRN. Optional AVX optimization is used for the first two.

* eliminated trailing whitespaces
parent 009d2efb
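
For context, the central API change in this commit is that activation layers now expose ActivationLayer::forwardSlice() and BaseConvolutionLayer gains setActivation(), so the convolution kernel can apply the activation directly on each output stripe it has just computed instead of running a separate pass over the whole blob. The following standalone sketch is not part of the diff; ToyActivation, ToyReLU and main() are illustrative stand-ins that only demonstrate the forwardSlice() contract as it is used by ParallelConv below: src and dst point at the same spatial offset of channel cn0, consecutive channel planes are outPlaneSize elements apart, and len elements of each plane in [cn0, cn1) are processed.

// Standalone sketch (not OpenCV code): illustrates the forwardSlice() contract
// introduced by this commit, mirroring ReLUFunctor::apply and the call site in
// ParallelConv. Names below are illustrative only.
#include <cstdio>
#include <cstddef>
#include <vector>

struct ToyActivation
{
    // Process `len` elements of each channel plane, for channels [cn0, cn1).
    // Consecutive channel planes are outPlaneSize floats apart; src and dst
    // point at the same spatial offset of channel cn0.
    virtual void forwardSlice(const float* src, float* dst, int len,
                              size_t outPlaneSize, int cn0, int cn1) const = 0;
    virtual ~ToyActivation() {}
};

struct ToyReLU : ToyActivation
{
    float slope;
    explicit ToyReLU(float s = 0.f) : slope(s) {}
    void forwardSlice(const float* src, float* dst, int len,
                      size_t outPlaneSize, int cn0, int cn1) const
    {
        for (int cn = cn0; cn < cn1; cn++, src += outPlaneSize, dst += outPlaneSize)
            for (int i = 0; i < len; i++)
                dst[i] = src[i] >= 0.f ? src[i] : slope * src[i];
    }
};

int main()
{
    const int outCn = 2, outPlaneSize = 8;   // toy output: 2 channels, 8 pixels each
    std::vector<float> out(outCn * outPlaneSize, -1.f);
    out[3] = 2.f; out[outPlaneSize + 5] = 3.f;

    // Mimics ParallelConv: apply the fused activation in-place on a stripe of
    // the just-computed convolution output (here, the whole plane).
    ToyReLU relu(0.1f);
    relu.forwardSlice(&out[0], &out[0], outPlaneSize, outPlaneSize, 0, outCn);

    for (int i = 0; i < outCn * outPlaneSize; i++)
        printf("%.2f%c", out[i], (i + 1) % outPlaneSize ? ' ' : '\n');
    return 0;
}

In the real code, the functor-based activations (ReLUFunctor, TanHFunctor, ChannelsPReLUFunctor, ...) implement exactly this slicing pattern in their apply() methods, and ElementWiseLayer::forwardSlice simply forwards to them.
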
......@@ -201,9 +201,13 @@ namespace dnn
String padMode;
};
class CV_EXPORTS ActivationLayer;
class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
{
public:
virtual bool setActivation(const Ptr<ActivationLayer>& layer) = 0;
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
......@@ -327,8 +331,14 @@ namespace dnn
};
/* Activations */
class CV_EXPORTS ActivationLayer : public Layer
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const = 0;
};
class CV_EXPORTS ReLULayer : public Layer
class CV_EXPORTS ReLULayer : public ActivationLayer
{
public:
float negativeSlope;
......@@ -336,37 +346,37 @@ namespace dnn
static Ptr<ReLULayer> create(const LayerParams &params);
};
class CV_EXPORTS ChannelsPReLULayer : public Layer
class CV_EXPORTS ChannelsPReLULayer : public ActivationLayer
{
public:
static Ptr<ChannelsPReLULayer> create(const LayerParams& params);
};
class CV_EXPORTS TanHLayer : public Layer
class CV_EXPORTS TanHLayer : public ActivationLayer
{
public:
static Ptr<TanHLayer> create(const LayerParams &params);
};
class CV_EXPORTS SigmoidLayer : public Layer
class CV_EXPORTS SigmoidLayer : public ActivationLayer
{
public:
static Ptr<SigmoidLayer> create(const LayerParams &params);
};
class CV_EXPORTS BNLLLayer : public Layer
class CV_EXPORTS BNLLLayer : public ActivationLayer
{
public:
static Ptr<BNLLLayer> create(const LayerParams &params);
};
class CV_EXPORTS AbsLayer : public Layer
class CV_EXPORTS AbsLayer : public ActivationLayer
{
public:
static Ptr<AbsLayer> create(const LayerParams &params);
};
class CV_EXPORTS PowerLayer : public Layer
class CV_EXPORTS PowerLayer : public ActivationLayer
{
public:
float power, scale, shift;
......@@ -374,7 +384,7 @@ namespace dnn
static Ptr<PowerLayer> create(const LayerParams &params);
};
/* Layers using in semantic segmentation */
/* Layers used in semantic segmentation */
class CV_EXPORTS CropLayer : public Layer
{
......
......@@ -43,6 +43,7 @@
#include "layers_common.hpp"
#include "op_im2col.hpp"
#include "op_blas.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <iostream>
namespace cv
......@@ -96,12 +97,17 @@ public:
(stride.height == 1 && stride.width == 1) &&
(dilation.height == 1 && dilation.width == 1);
}
bool setActivation(const Ptr<ActivationLayer>& ) { return false; }
};
//TODO: perform convolution and bias addition simultaneously for cache optimization
class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
public:
enum { VEC_ALIGN = 8 };
Mat weightsMat;
Ptr<ActivationLayer> activ;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
Size out(outShape[3], outShape[2]);
......@@ -117,7 +123,7 @@ public:
{
CV_Assert(blobs.size() != 0);
CV_Assert(!hasBias() || blobs[1].total() == (size_t)blobs[0].size[0]);
CV_Assert(inputs.size() != 0);
CV_Assert(inputs.size() == (size_t)1);
internals.clear();
......@@ -138,91 +144,376 @@ public:
getConvPoolOutParams(Size(inpH, inpW), kernel, stride, padMode, out);
}
int group = inpCn / blobs[0].size[1];
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].size[0] == outCn);
int ngroups = inpCn / blobs[0].size[1];
CV_Assert(inpCn % ngroups == 0 && outCn % ngroups == 0);
int dims[] = {inputs[0][0], outCn, out.height, out.width};
outputs.resize(inputs.size(), shape(dims));
internals.push_back(MatShape());
if (!is1x1())
internals[0] = computeColRowShape(inputs[0], outputs[0]);
if (hasBias())
internals.push_back(shape(1, out.area()));
return false;
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_Assert(inputs.size() > 0);
internals[0].setTo(0);
if (hasBias())
internals[1].setTo(1);
int outCn = blobs[0].size[0];
int inpCn = inputs[0]->size[1];
int inpGroupCn = blobs[0].size[1];
bool setActivation(const Ptr<ActivationLayer>& layer) { activ = layer; return true; }
Mat weightsMat = blobs[0].reshape(1, outCn);
Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
class ParallelConv : public cv::ParallelLoopBody
{
public:
enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 };
const Mat* input_;
const Mat* weights_;
Mat* output_;
int outShape[4];
Size kernel_, pad_, stride_, dilation_;
int ngroups_, nstripes_;
std::vector<int> ofstab_;
std::vector<float> biasvec_;
const ActivationLayer* activ_;
bool is1x1_;
bool useAVX2;
ParallelConv() {}
static void run( const Mat& input, Mat& output,
const Mat& weights, const Mat& bias,
Size kernel, Size pad, Size stride, Size dilation,
int ngroups, int nstripes, const ActivationLayer* activ )
{
CV_Assert( input.dims == 4 && output.dims == 4 &&
input.size[0] == output.size[0] &&
weights.rows == output.size[1] &&
weights.cols == (input.size[1]/ngroups)*kernel.width*kernel.height &&
input.type() == output.type() &&
input.type() == weights.type() &&
input.type() == CV_32F &&
input.isContinuous() &&
output.isContinuous() &&
(bias.empty() || (bias.isContinuous() && bias.type() == CV_32F &&
bias.total() == (size_t)output.size[1])));
ParallelConv p;
p.input_ = &input;
p.weights_ = &weights;
p.output_ = &output;
for( int i = 0; i < 4; i++ ) p.outShape[i] = output.size[i];
p.outShape[1] /= ngroups;
p.kernel_ = kernel; p.pad_ = pad; p.stride_ = stride; p.dilation_ = dilation;
p.ngroups_ = ngroups;
p.nstripes_ = nstripes;
p.activ_ = activ;
int inpCnAll = input.size[1], width = input.size[3], height = input.size[2];
int inpCn = inpCnAll / ngroups;
int k, outCn = output.size[1];
p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0);
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
p.ofstab_.resize(kernel.width*kernel.height*ncn);
int* ofstab = &p.ofstab_[0];
for( k = 0; k < ncn; k++ )
for( int k_r = 0; k_r < kernel.height; k_r++ )
for( int k_c = 0; k_c < kernel.width; k_c++ )
ofstab[(k*kernel.height + k_r)*kernel.width + k_c] =
(k*height + k_r*dilation.height)*width + k_c*dilation.width;
p.biasvec_.resize(outCn+2);
float* biasvec = &p.biasvec_[0];
if( bias.empty() )
{
for( k = 0; k < outCn; k++ )
biasvec[k] = 0.f;
}
else
{
for( k = 0; k < outCn; k++ )
biasvec[k] = bias.at<float>(k);
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
parallel_for_(Range(0, nstripes), p, nstripes);
}
for (size_t ii = 0; ii < outputs.size(); ii++)
virtual void operator ()(const Range &r0) const
{
int numImg = inputs[ii]->size[0];
int group = inpCn / blobs[0].size[1];
int outGroupCn = outCn / group;
Mat inpMat = *inputs[ii];
Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);
const int valign = ConvolutionLayerImpl::VEC_ALIGN;
int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
int outW = output_->size[3], outH = output_->size[2], outCn = output_->size[1]/ngroups;
int width = input_->size[3], height = input_->size[2], inpCn = input_->size[1]/ngroups;
int nstripes = nstripes_;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int stride_w = stride_.width, stride_h = stride_.height;
int dilation_w = dilation_.width, dilation_h = dilation_.height;
int karea = kernel_w*kernel_h;
int i, j, k;
size_t inpPlaneSize = width*height;
size_t outPlaneSize = outW*outH;
bool is1x1 = is1x1_;
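// choose the parallelization granularity: if there are enough stripes, each
// sample's output plane is split across several stripes; otherwise work is
// distributed whole-plane-at-a-time (stripeSize == outPlaneSize), possibly
// several samples per worker.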
int stripesPerSample;
size_t stripeSize;
Range r = r0;
if( nstripes >= batchSize*2 )
{
stripesPerSample = nstripes/batchSize;
stripeSize = alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
stripeSize = std::min(stripeSize, outPlaneSize);
}
else
{
stripesPerSample = 1;
int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1);
r.start *= samplesPerStripe;
r.end *= samplesPerStripe;
nstripes *= samplesPerStripe;
stripeSize = outPlaneSize;
}
for (int n = 0; n < numImg; n++)
const float* data_inp0_ = input_->ptr<float>();
const int* ofstab = &ofstab_[0];
const float* wptr_orig_ = weights_->ptr<float>();
size_t wstep = weights_->step1();
const float* biasvec = &biasvec_[0];
float* data_out0_ = output_->ptr<float>();
size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
const int valignBytes = (int)(valign*sizeof(float));
AutoBuffer<float> rowbuf0_(rowbufsz + valignBytes);
float* rowbuf0 = alignPtr((float*)rowbuf0_, valignBytes);
// we clear the buffer once; ultimately, it lets us avoid
// tail processing after running the unrolled/vectorized loop.
// the main idea is to make sure that the tail (a.k.a. padding) of each row
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
// does not contain NaNs or Infs. Because the padding in the weights
// matrix is explicitly initialized with 0's, we handle all other
// cases nicely, i.e. we can skip explicit re-initialization
// of the padding - we just retain elements from the previous iteration
// of the loop over channels (cn0).
memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
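// rowbuf0 layout: up to BLK_SIZE rows (one per output location of the current
// block), each vsz_a floats wide; elements [0, vsz) of a row hold the gathered
// input patch, while [vsz, vsz_a) is the zero alignment tail described above.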
for( int stripe = r.start; stripe < r.end; stripe++ )
{
for (int g = 0; g < group; g++)
int subsampleIdx = stripe/stripesPerSample;
if( subsampleIdx >= batchSize )
break;
int stripeStart = (int)((stripe - subsampleIdx*stripesPerSample)*stripeSize);
int stripeEnd = (int)std::min(stripeStart + stripeSize, outPlaneSize);
const float* data_inp0 = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn;
float* data_out0 = data_out0_ + subsampleIdx*outPlaneSize*outCn;
int startOutCn = (subsampleIdx % ngroups)*outCn;
const float* wptr_orig = wptr_orig_ + wstep*startOutCn;
const float* biasptr = biasvec + startOutCn;
for( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN )
{
Mat curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
im2row(curInp, internals[0], shape(inpMat), shape(outputs[ii]));
_Range kerRange(g * outGroupCn, outGroupCn);
Mat kerMat = weightsMat.rowRange(kerRange);
_Range outRange((g + n * group) * outGroupCn, outGroupCn);
Mat dstMat = outMat.rowRange(outRange);
int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn);
int ncn = cn1 - cn0, vsz = karea*ncn;
int vsz_a = (int)alignSize(vsz, valign);
const float* wptr = wptr_orig + cn0*karea;
dnn::gemm(kerMat, internals[0], 1, dstMat, 0, GEMM_2_T);
if (hasBias())
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
{
dnn::gemm(biasesMat.rowRange(kerRange), internals[1], 1, dstMat, 1);
int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
// do im2row for a part of input tensor
if( is1x1 )
{
for( ofs = ofs0; ofs < ofs1; ofs++ )
{
int out_i = ofs / outW;
int out_j = ofs - out_i * outW;
float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
int in_i = out_i * stride_h - pad_h;
int in_j = out_j * stride_w - pad_w;
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
for( k = 0; k < vsz; k++ )
rowbuf[k] = imgptr[k*inpPlaneSize];
}
}
else
{
for( ofs = ofs0; ofs < ofs1; ofs++ )
{
int out_i = ofs / outW;
int out_j = ofs - out_i * outW;
float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
int in_i = out_i * stride_h - pad_h;
int in_j = out_j * stride_w - pad_w;
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
// this condition should be true for most of the tensor elements, i.e.
// most of the time the kernel aperture is inside the tensor X-Y plane.
if( 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h &&
0 <= in_j && in_j < width - (kernel_w-1)*dilation_w )
{
for( k = 0; k < vsz; k++ )
rowbuf[k] = imgptr[ofstab[k]];
}
else
{
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
// here some non-contiguous sub-row of the row will not be
// filled from the tensor; we need to make sure that the uncovered
// elements are explicitly set to 0's. the easiest way is to
// set all the elements to 0's before the loop.
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
for( k = 0; k < ncn; k++, imgptr += width*height )
{
for( i = i0; i < i1; i++ )
{
for( j = j0; j < j1; j++ )
{
int imgofs = i*(dilation_h*width) + j*dilation_w;
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
}
}
}
}
}
}
// now compute dot product of the weights
// and im2row-transformed part of the tensor
int bsz = ofs1 - ofs0;
#if CV_DNN_TRY_AVX2
if(useAVX2)
fastConv_avx2(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, cn0 == 0);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{
const float* wptr0 = wptr + i*wstep;
const float* wptr1 = wptr0 + wstep;
float* outptr0 = data_out0 + ofs0 + i*outPlaneSize;
float* outptr1 = outptr0 + outPlaneSize;
float bias0 = biasptr[i], bias1 = biasptr[i+1];
if( i+1 >= outCn )
{
wptr1 = wptr0;
outptr1 = outptr0;
bias1 = bias0;
}
int j = 0;
#if CV_SIMD128
for( ; j <= bsz - 4; j += 4 )
{
const float* rptr = rowbuf0 + j*vsz_a;
v_float32x4 s0, s1;
if( cn0 == 0 )
{
s0 = v_setall_f32(bias0);
s1 = v_setall_f32(bias1);
}
else
{
s0 = v_load(outptr0 + j);
s1 = v_load(outptr1 + j);
}
v_float32x4 vs00 = v_setzero_f32(), vs01 = v_setzero_f32(),
vs02 = v_setzero_f32(), vs03 = v_setzero_f32(),
vs10 = v_setzero_f32(), vs11 = v_setzero_f32(),
vs12 = v_setzero_f32(), vs13 = v_setzero_f32();
for( k = 0; k < vsz; k += 4, rptr += 4 )
{
v_float32x4 w0 = v_load_aligned(wptr0 + k), w1 = v_load_aligned(wptr1 + k);
v_float32x4 r0 = v_load_aligned(rptr), r1 = v_load_aligned(rptr + vsz_a),
r2 = v_load_aligned(rptr + vsz_a*2), r3 = v_load_aligned(rptr + vsz_a*3);
vs00 += w0*r0;
vs01 += w0*r1;
vs02 += w0*r2;
vs03 += w0*r3;
vs10 += w1*r0;
vs11 += w1*r1;
vs12 += w1*r2;
vs13 += w1*r3;
}
s0 += v_reduce_sum4(vs00, vs01, vs02, vs03);
s1 += v_reduce_sum4(vs10, vs11, vs12, vs13);
v_store(outptr0 + j, s0);
v_store(outptr1 + j, s1);
}
#endif
for( ; j < bsz; j++ )
{
const float* rptr = rowbuf0 + j*vsz_a;
float s00, s10;
if( cn0 == 0 )
{
s00 = bias0;
s10 = bias1;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
}
for( k = 0; k < vsz; k++ )
{
float r0 = rptr[k];
s00 += wptr0[k]*r0;
s10 += wptr1[k]*r0;
}
outptr0[j] = s00;
outptr1[j] = s10;
}
}
}
}
if( activ_ )
activ_->forwardSlice(data_out0 + stripeStart, data_out0 + stripeStart,
(int)(stripeEnd - stripeStart),
outPlaneSize, startOutCn, startOutCn + outCn);
}
}
}
};
void im2row(const Mat &srcImg, Mat &dstRow, const MatShape& inShape, const MatShape& outShape)
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
int inpH = inShape[2];
int inpW = inShape[3];
int outH = outShape[2], outW = outShape[3];
int inpGroupCn = blobs[0].size[1];
int ksize = inpGroupCn * kernel.height * kernel.width;
CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
int ngroups = inputs[0]->size[1]/blobs[0].size[1];
CV_Assert(outputs[0].size[1] % ngroups == 0);
if (is1x1())
{
transpose(srcImg.reshape(1, ksize), dstRow);
}
else
int outCn = blobs[0].size[0];
if( weightsMat.empty() )
{
cv::dnn::im2row(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
kernel.width, pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, outH, outW, dstRow.ptr<float>());
Mat wm = blobs[0].reshape(1, outCn);
if( wm.step1() % VEC_ALIGN != 0 )
{
int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
Mat wm_buffer = Mat(outCn, newcols, wm.type());
Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
wm_padding.setTo(Scalar::all(0.));
Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
wm.copyTo(wm_aligned);
wm = wm_aligned;
}
weightsMat = wm;
}
Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
int nstripes = std::max(getNumThreads(), 1);
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasesMat,
kernel, pad, stride, dilation, ngroups, nstripes, activ.get());
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
......@@ -249,8 +540,8 @@ public:
int inpH = inpShape[2];
int inpW = inpShape[3];
int outCn = outShape[1];
int group = inpCn / blobs[0].size[1];
int outGroupCn = outCn / group;
int ngroups = inpCn / blobs[0].size[1];
int outGroupCn = outCn / ngroups;
int ksize = outGroupCn * kernel.height * kernel.width;
return shape(ksize, inpH * inpW);
}
......@@ -271,10 +562,10 @@ public:
int outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
int outCn = blobs[0].size[0];
int group = inpCn / blobs[0].size[1];
int ngroups = inpCn / blobs[0].size[1];
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
CV_Assert(inpCn % ngroups == 0 && outCn % ngroups == 0);
CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / ngroups);
int dims[] = {inputs[0][0], outCn, outH, outW};
outputs.resize(inputs.size(), shape(dims));
......@@ -303,9 +594,9 @@ public:
for (size_t ii = 0; ii < outputs.size(); ii++)
{
int group = inpCn / blobs[0].size[1];
int ngroups = inpCn / blobs[0].size[1];
int inpGroupCn = blobs[0].size[1];
int outGroupCn = outCn / group;
int outGroupCn = outCn / ngroups;
int numImg = inputs[ii]->size[0];
Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
......@@ -313,12 +604,12 @@ public:
for (int n = 0; n < numImg; n++)
{
for (int g = 0; g < group; g++)
for (int g = 0; g < ngroups; g++)
{
Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
Mat dstMat = decnBlob.rowRange(_Range((g + n * ngroups) * outGroupCn, outGroupCn));
Mat &colMat = (is1x1()) ? dstMat : internals[0];
Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
Mat convMat = convBlob.rowRange(_Range((g + n * ngroups) * inpGroupCn, inpGroupCn));
Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
......@@ -340,8 +631,8 @@ public:
{
int outCn = outShape[1], outH = outShape[2], outW = outShape[3];
int inpCn = inShape[1];
int group = inpCn / blobs[0].size[1];
int outGroupCn = outCn / group;
int ngroups = inpCn / blobs[0].size[1];
int outGroupCn = outCn / ngroups;
if (is1x1())
{
......@@ -382,12 +673,12 @@ static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, const Laye
bool bias = params.get<bool>("bias_term", true);
int numOutput = params.get<int>("num_output");
int group = params.get<int>("group", 1);
int ngroups = params.get<int>("group", 1);
l->adjustPad.height = params.get<int>("adj_h", 0);
l->adjustPad.width = params.get<int>("adj_w", 0);
CV_Assert(numOutput % group == 0);
CV_Assert(numOutput % ngroups == 0);
CV_Assert((bias && l->blobs.size() == 2) || (!bias && l->blobs.size() == 1));
}
......
......@@ -16,25 +16,53 @@ template<typename Func>
class ElementWiseLayer : public Func::Layer
{
public:
template<typename Dtype>
class PBody : public cv::ParallelLoopBody
{
Func &func;
Dtype *src, *dst;
public:
const Func* func_;
const Mat* src_;
Mat* dst_;
int nstripes_;
PBody(Mat &src, Mat &dst, Func &func_) :
func(func_), src(src.ptr<Dtype>()), dst(dst.ptr<Dtype>())
{}
PBody(const Func &func, const Mat &src, Mat& dst, int nstripes)
{
func_ = &func;
src_ = &src;
dst_ = &dst;
nstripes_ = nstripes;
}
void operator()(const Range &r) const
{
for (int i = r.start; i < r.end; i++)
dst[i] = func(src[i]);
int nstripes = nstripes_, nsamples, outCn;
size_t planeSize;
if( src_->dims == 4 )
{
nsamples = src_->size[0];
outCn = src_->size[1];
planeSize = (size_t)src_->size[2]*src_->size[3];
}
else
{
nsamples = outCn = 1;
planeSize = (size_t)src_->total();
}
size_t stripeSize = (planeSize + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, planeSize);
for( int i = 0; i < nsamples; i++ )
{
const float* srcptr = src_->ptr<float>(i) + stripeStart;
float* dstptr = dst_->ptr<float>(i) + stripeStart;
func_->apply(srcptr, dstptr, (int)(stripeEnd - stripeStart), planeSize, 0, outCn);
}
}
};
ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}
ElementWiseLayer(const Func &f=Func()) { func = f; }
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
......@@ -49,20 +77,22 @@ public:
{
for (size_t i = 0; i < inputs.size(); i++)
{
Mat &src = *inputs[i];
const Mat &src = *inputs[i];
Mat &dst = outputs[i];
CV_Assert(src.isContinuous() && dst.isContinuous());
CV_Assert(src.size == dst.size && src.type() == dst.type() &&
src.isContinuous() && dst.isContinuous() && src.type() == CV_32F);
Range sizeRange = Range(0, dst.total());
CV_Assert(src.type() == CV_32F);
PBody<float> body(src, dst, func);
if( run_parallel )
cv::parallel_for_(sizeRange, body);
else
body(sizeRange);
const int nstripes = getNumThreads();
PBody body(func, src, dst, nstripes);
parallel_for_(Range(0, nstripes), body, nstripes);
}
}
void forwardSlice(const float* src, float* dst, int len, size_t planeSize, int cn0, int cn1) const
{
func.apply(src, dst, len, planeSize, cn0, cn1);
}
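// forwardSlice() exposes the same functor to other layers; this is what lets
// the convolution layer fuse the activation via setActivation() and apply it
// to each stripe of its own output.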
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const
{
......@@ -83,172 +113,208 @@ struct ReLUFunctor
typedef ReLULayer Layer;
float slope;
ReLUFunctor(float slope_) : slope(slope_) {}
explicit ReLUFunctor(float slope_=1.f) : slope(slope_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return (x >= (TFloat)0) ? x : (TFloat)slope * x;
float s = slope;
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
int i = 0;
#if CV_SIMD128
v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32();
for( ; i <= len - 16; i += 16 )
{
v_float32x4 x0 = v_load(srcptr + i);
v_float32x4 x1 = v_load(srcptr + i + 4);
v_float32x4 x2 = v_load(srcptr + i + 8);
v_float32x4 x3 = v_load(srcptr + i + 12);
x0 = v_select(x0 >= z, x0, x0*s4);
x1 = v_select(x1 >= z, x1, x1*s4);
x2 = v_select(x2 >= z, x2, x2*s4);
x3 = v_select(x3 >= z, x3, x3*s4);
v_store(dstptr + i, x0);
v_store(dstptr + i + 4, x1);
v_store(dstptr + i + 8, x2);
v_store(dstptr + i + 12, x3);
}
#endif
for( ; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = x >= 0.f ? x : s*x;
}
}
}
int64 getFLOPSPerElement() const {return 1;}
int64 getFLOPSPerElement() const { return 1; }
};
struct TanHFunctor
{
typedef TanHLayer Layer;
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return tanh(x);
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = tanh(x);
}
}
}
int64 getFLOPSPerElement() const {return 1;}
int64 getFLOPSPerElement() const { return 1; }
};
struct SigmoidFunctor
{
typedef SigmoidLayer Layer;
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return (TFloat)1 / ((TFloat)1 + exp(-x));
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = 1.f/(1.f + exp(-x));
}
}
}
int64 getFLOPSPerElement() const {return 3;}
int64 getFLOPSPerElement() const { return 3; }
};
struct AbsValFunctor
{
typedef AbsLayer Layer;
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return abs(x);
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = abs(x);
}
}
}
int64 getFLOPSPerElement() const {return 1;}
int64 getFLOPSPerElement() const { return 1; }
};
struct BNLLFunctor
{
typedef BNLLLayer Layer;
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return log((TFloat)1 + exp(-abs(x)));
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = log(1.f + exp(-abs(x)));
}
}
}
int64 getFLOPSPerElement() const {return 5;}
int64 getFLOPSPerElement() const { return 5; }
};
struct PowerFunctor
{
typedef PowerLayer Layer;
const float power;
const float scale;
const float shift;
float power;
float scale;
float shift;
PowerFunctor(float power_, float scale_ = 1.f, float shift_ = 0)
explicit PowerFunctor(float power_ = 1.f, float scale_ = 1.f, float shift_ = 0.f)
: power(power_), scale(scale_), shift(shift_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
float a = scale, b = shift, p = power;
if( p == 1.f )
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = a*x + b;
}
}
}
else
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = pow(a*x + b, p);
}
}
}
}
int64 getFLOPSPerElement() const {return 3;}
int64 getFLOPSPerElement() const { return power == 1 ? 2 : 10; }
};
struct PowerFunctor1
{
typedef PowerLayer Layer;
const float scale;
const float shift;
PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
: scale(scale_), shift(shift_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return (TFloat)shift + (TFloat)scale * x;
}
int64 getFLOPSPerElement() const {return 2;}
};
class ChannelsPReLULayerImpl : public ChannelsPReLULayer
struct ChannelsPReLUFunctor
{
public:
ChannelsPReLULayerImpl(const LayerParams& params)
{
CV_Assert(params.blobs.size() == 1);
setParamsFrom(params);
}
typedef ChannelsPReLULayer Layer;
Mat scale;
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const
explicit ChannelsPReLUFunctor(const Mat& scale_=Mat()) : scale(scale_)
{
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return true;
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
CV_Assert(inputs.size() == 1);
Mat &inpBlob = *inputs[0];
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Mat &outBlob = outputs[ii];
CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());
CV_Assert(scale.isContinuous() && scale.type() == CV_32F);
CV_Assert(blobs[0].total() == inpBlob.size[1]);
const float* scaleptr = scale.ptr<float>();
CV_Assert( 0 <= cn0 && cn0 < cn1 && cn1 <= (int)scale.total() );
for (int n = 0; n < inpBlob.size[1]; n++)
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
float s = scaleptr[cn];
int i = 0;
#if CV_SIMD128
v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32();
for( ; i <= len - 16; i += 16 )
{
float slopeWeight = blobs[0].at<float>(n);
Mat inpBlobPlane = getPlane(inpBlob, 0, n);
Mat outBlobPlane = getPlane(outBlob, 0, n);
size_t i, planeTotal = inpBlobPlane.total();
const float* inptr = inpBlobPlane.ptr<float>();
float* outptr = outBlobPlane.ptr<float>();
for( i = 0; i < planeTotal; i++ )
{
float val = inptr[i];
outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
}
//threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
//scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
v_float32x4 x0 = v_load(srcptr + i);
v_float32x4 x1 = v_load(srcptr + i + 4);
v_float32x4 x2 = v_load(srcptr + i + 8);
v_float32x4 x3 = v_load(srcptr + i + 12);
x0 = v_select(x0 >= z, x0, x0*s4);
x1 = v_select(x1 >= z, x1, x1*s4);
x2 = v_select(x2 >= z, x2, x2*s4);
x3 = v_select(x3 >= z, x3, x3*s4);
v_store(dstptr + i, x0);
v_store(dstptr + i + 4, x1);
v_store(dstptr + i + 8, x2);
v_store(dstptr + i + 12, x3);
}
#endif
for( ; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = x >= 0.f ? x : s*x;
}
}
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const
{
(void)inputs; // suppress unused variable warning
long flops = 0;
for (int i = 0; i < outputs.size(); i++)
{
flops += total(outputs[i]) * 3;
}
return flops;
}
int64 getFLOPSPerElement() const { return 1; }
};
#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \
......@@ -259,7 +325,7 @@ Ptr<_Layer> _Layer::create() { \
Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
{
float negativeSlope = params.get<float>("negative_slope", 0.f);
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(true, ReLUFunctor(negativeSlope)));
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
l->setParamsFrom(params);
l->negativeSlope = negativeSlope;
......@@ -268,7 +334,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
{
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
l->setParamsFrom(params);
return l;
......@@ -276,7 +342,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
{
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
l->setParamsFrom(params);
return l;
......@@ -292,7 +358,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
{
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
l->setParamsFrom(params);
return l;
......@@ -303,9 +369,7 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
float power = params.get<float>("power", 1.0f);
float scale = params.get<float>("scale", 1.0f);
float shift = params.get<float>("shift", 0.0f);
Ptr<PowerLayer> l(power == 1.f ?
(PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
(PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
l->setParamsFrom(params);
l->power = power;
l->scale = scale;
......@@ -314,10 +378,12 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
return l;
}
Ptr<ChannelsPReLULayer> ChannelsPReLULayer::create(const LayerParams& params)
{
return Ptr<ChannelsPReLULayer>(new ChannelsPReLULayerImpl(params));
Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0])));
l->setParamsFrom(params);
return l;
}
}
......
......@@ -52,6 +52,8 @@ namespace dnn
class FullyConnectedLayerImpl : public InnerProductLayer
{
public:
enum { VEC_ALIGN = 8 };
FullyConnectedLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
......@@ -65,15 +67,29 @@ public:
CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));
blobs[0] = blobs[0].reshape(1, numOutput);
weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
int vecsize = weightsMat.cols;
if( vecsize % VEC_ALIGN != 0 )
{
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
wpadding.setTo(Scalar::all(0.));
weightsMat = weightsBuf.colRange(0, vecsize);
blobs[0].copyTo(weightsMat);
blobs[0] = weightsMat;
}
if (bias)
blobs[1] = blobs[1].reshape(1, 1);
biasMat = blobs[1] = blobs[1].reshape(1, 1);
else
biasMat = Mat::zeros(1, numOutput, weightsMat.type());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const
std::vector<MatShape> &) const
{
CV_Assert(inputs.size() > 0);
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
......@@ -84,36 +100,116 @@ public:
int numOutput = blobs[0].size[0];
outputs.resize(inputs.size(), shape(outerSize, numOutput));
internals.push_back(shape(outerSize, 1));
CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
return false;
}
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
class FullConnected : public ParallelLoopBody
{
internals[0].setTo(1.);
const Mat &weight = blobs[0];
const Mat *biasMat = NULL, *biasOnesMat = NULL;
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
public:
FullConnected(const Mat& srcMat, const Mat& weights, const Mat& biasMat, Mat& dstMat, int nstripes)
{
CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
srcMat.type() == CV_32F &&
(biasMat.empty() || (biasMat.type() == srcMat.type() &&
biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );
srcMat_ = &srcMat;
weights_ = &weights;
biasMat_ = &biasMat;
dstMat_ = &dstMat;
nstripes_ = nstripes;
useAVX2_ = checkHardwareSupport(CPU_AVX2);
}
if (bias)
void operator()(const Range& r) const
{
biasOnesMat = &internals[0];
biasMat = &blobs[1];
int nsamples = srcMat_->rows;
int nw0 = weights_->rows;
int vecsize = srcMat_->cols;
int nstripes = nstripes_;
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights_->step1();
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr = srcMat_->ptr<float>(sampleIdx);
const float* wptr = weights_->ptr<float>(delta);
float* dptr = dstMat_->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat_->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
#if CV_DNN_TRY_AVX2
if( useAVX2_ )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0, k;
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
for( k = 0; k < vecsize; k += 4 )
{
v_float32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
vs3 += v*v_load_aligned(wptr + wstep*3 + k);
}
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
s += v_load(biasptr + i);
v_store(dptr + i, s);
}
#endif
for( ; i < nw; i++, wptr += wstep )
{
float s0=biasptr[i];
for( k = 0; k < vecsize; k++ )
{
float v = sptr[k];
s0 += v*wptr[k];
}
dptr[i] = s0;
}
}
ofs += nw;
}
}
const Mat *srcMat_, *weights_, *biasMat_;
Mat* dstMat_;
int nstripes_;
bool useAVX2_;
};
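// Semantically, FullConnected computes dstMat = srcMat * weights^T + biasMat
// (the same result as the previous gemm-based path), with the flattened
// (sample, output neuron) index range split evenly into nstripes stripes.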
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &)
{
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
for (size_t i = 0; i < input.size(); i++)
{
Mat srcMat = input[i]->reshape(1, outerSize);
Mat dstMat = output[i].reshape(1, outerSize);
dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T);
if (bias)
dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1);
const int nstripes = getNumThreads();
FullConnected fconn(srcMat, weightsMat, biasMat, dstMat, nstripes);
parallel_for_(Range(0, nstripes), fconn, nstripes);
}
}
......@@ -134,6 +230,7 @@ public:
}
bool bias;
Mat weightsMat, biasMat;
};
Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#if CV_DNN_TRY_AVX2
#include <immintrin.h>
namespace cv {
namespace dnn {
void fastConv_avx2( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, bool initOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
// now compute dot product of the weights
// and im2row-transformed part of the tensor
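// the block below processes 3 output channels x 4 output positions at a time,
// accumulating dot products with FMA and reducing them with horizontal adds.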
for( int i = 0; i < outCn; i += 3 )
{
const float* wptr0 = weights + i*wstep;
const float* wptr1 = wptr0 + wstep;
const float* wptr2 = wptr1 + wstep;
float* outptr0 = output + i*outPlaneSize;
float* outptr1 = outptr0 + outPlaneSize;
float* outptr2 = outptr1 + outPlaneSize;
float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
}
}
int j = 0;
for( ; j <= blockSize - 4; j += 4 )
{
const float* rptr = rowbuf + j*vecsize_aligned;
__m256 vs00 = _mm256_setzero_ps(), vs01 = _mm256_setzero_ps(),
vs02 = _mm256_setzero_ps(), vs03 = _mm256_setzero_ps(),
vs10 = _mm256_setzero_ps(), vs11 = _mm256_setzero_ps(),
vs12 = _mm256_setzero_ps(), vs13 = _mm256_setzero_ps(),
vs20 = _mm256_setzero_ps(), vs21 = _mm256_setzero_ps(),
vs22 = _mm256_setzero_ps(), vs23 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, rptr += 8 )
{
__m256 w0 = _mm256_load_ps(wptr0 + k);
__m256 w1 = _mm256_load_ps(wptr1 + k);
__m256 w2 = _mm256_load_ps(wptr2 + k);
__m256 r0 = _mm256_load_ps(rptr);
vs00 = _mm256_fmadd_ps(w0, r0, vs00);
vs10 = _mm256_fmadd_ps(w1, r0, vs10);
vs20 = _mm256_fmadd_ps(w2, r0, vs20);
r0 = _mm256_load_ps(rptr + vecsize_aligned);
vs01 = _mm256_fmadd_ps(w0, r0, vs01);
vs11 = _mm256_fmadd_ps(w1, r0, vs11);
vs21 = _mm256_fmadd_ps(w2, r0, vs21);
r0 = _mm256_load_ps(rptr + vecsize_aligned*2);
vs02 = _mm256_fmadd_ps(w0, r0, vs02);
vs12 = _mm256_fmadd_ps(w1, r0, vs12);
vs22 = _mm256_fmadd_ps(w2, r0, vs22);
r0 = _mm256_load_ps(rptr + vecsize_aligned*3);
vs03 = _mm256_fmadd_ps(w0, r0, vs03);
vs13 = _mm256_fmadd_ps(w1, r0, vs13);
vs23 = _mm256_fmadd_ps(w2, r0, vs23);
}
__m256 t0 = _mm256_hadd_ps(_mm256_hadd_ps(vs00, vs01), _mm256_hadd_ps(vs02, vs03));
__m256 t1 = _mm256_hadd_ps(_mm256_hadd_ps(vs10, vs11), _mm256_hadd_ps(vs12, vs13));
__m256 t2 = _mm256_hadd_ps(_mm256_hadd_ps(vs20, vs21), _mm256_hadd_ps(vs22, vs23));
t0 = _mm256_add_ps(t0, _mm256_permute2f128_ps(t0, t0, 1));
t1 = _mm256_add_ps(t1, _mm256_permute2f128_ps(t1, t1, 1));
t2 = _mm256_add_ps(t2, _mm256_permute2f128_ps(t2, t2, 1));
__m256 s0, s1, s2;
if( initOutput )
{
s0 = _mm256_set1_ps(bias0);
s1 = _mm256_set1_ps(bias1);
s2 = _mm256_set1_ps(bias2);
}
else
{
s0 = _mm256_castps128_ps256(_mm_loadu_ps(outptr0 + j));
s1 = _mm256_castps128_ps256(_mm_loadu_ps(outptr1 + j));
s2 = _mm256_castps128_ps256(_mm_loadu_ps(outptr2 + j));
}
s0 = _mm256_add_ps(s0, t0);
s1 = _mm256_add_ps(s1, t1);
s2 = _mm256_add_ps(s2, t2);
_mm_storeu_ps(outptr0 + j, _mm256_castps256_ps128(s0));
_mm_storeu_ps(outptr1 + j, _mm256_castps256_ps128(s1));
_mm_storeu_ps(outptr2 + j, _mm256_castps256_ps128(s2));
}
for( ; j < blockSize; j++ )
{
const float* rptr = rowbuf + j*vecsize_aligned;
float s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
float r0 = rptr[k];
s00 += wptr0[k]*r0;
s10 += wptr1[k]*r0;
s20 += wptr2[k]*r0;
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
_mm256_zeroupper();
}
// dst = vec * weights^t + bias
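// processes 8 outputs (rows of weights) per iteration of the outer loop; the
// inner k-loop steps by 8 and relies on the zero padding the caller adds to the
// weights rows (see VEC_ALIGN in the fully connected layer), so any extra lanes
// beyond vecsize contribute nothing to the sums.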
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
int i = 0;
for( ; i <= nvecs - 8; i += 8 )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = _mm256_setzero_ps(), vs1 = _mm256_setzero_ps(),
vs2 = _mm256_setzero_ps(), vs3 = _mm256_setzero_ps(),
vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(),
vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2);
vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3);
vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4);
vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5);
vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6);
vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7);
}
__m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3));
__m256 s1 = _mm256_hadd_ps(_mm256_hadd_ps(vs4, vs5), _mm256_hadd_ps(vs6, vs7));
s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
s1 = _mm256_add_ps(s1, _mm256_permute2f128_ps(s1, s1, 1));
s0 = _mm256_add_ps(s0, _mm256_castps128_ps256(_mm_loadu_ps(bias + i)));
s1 = _mm256_add_ps(s1, _mm256_castps128_ps256(_mm_loadu_ps(bias + i + 4)));
_mm_storeu_ps(dst + i, _mm256_castps256_ps128(s0));
_mm_storeu_ps(dst + i + 4, _mm256_castps256_ps128(s1));
}
float temp = 0.f;
for( ; i < nvecs; i++ )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
}
__m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0);
s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
_mm_store_ss(&temp, _mm256_castps256_ps128(s0));
dst[i] = temp + bias[i];
}
_mm256_zeroupper();
}
}
}
#endif
......@@ -64,6 +64,21 @@ void getConvPoolOutParams(const Size& inp, const Size &kernel,
void getConvPoolPaddings(const Size& inp, const Size& out,
const Size &kernel, const Size &stride,
const String &padMode, Size &pad);
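// AVX2 kernels are compiled whenever the SSE2 baseline is enabled; whether they
// are actually used is decided at run time via checkHardwareSupport(CPU_AVX2).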
#if CV_SSE2
#define CV_DNN_TRY_AVX2 1
void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, bool initOutput);
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
#else
#define CV_DNN_TRY_AVX2 0
#endif
}
}
......
......@@ -41,8 +41,9 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include "opencv2/imgproc.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/hal/hal.hpp"
#include <algorithm>
namespace cv
......@@ -100,45 +101,94 @@ public:
}
}
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
class ChannelLRN : public ParallelLoopBody
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
Mat srcMat = srcBlob.clone();
Mat dstMat = dstBlob;
public:
ChannelLRN(const float* src, float* dst, int channels, int ksize,
float alpha1, float bias1, float beta1,
size_t planeSize, int nsamples, int nstripes)
{
src_ = src; dst_ = dst;
channels_ = channels;
ksize_ = ksize;
alpha1_ = alpha1; bias1_ = bias1; beta1_ = beta1;
planeSize_ = planeSize; nsamples_ = nsamples; nstripes_ = nstripes;
}
for (int n = 0; n < num; n++)
void operator()(const Range& r) const
{
Mat accum = getPlane(dstMat, n, channels-1); //trick for memory saving
accum.setTo(0);
int nsamples = nsamples_, nstripes = nstripes_;
size_t planeSize = planeSize_, planeSize_n = planeSize * nsamples;
size_t elemsPerStripe = (planeSize_n + nstripes - 1)/nstripes;
size_t rstart = r.start*elemsPerStripe;
size_t rend = r.end == nstripes ? planeSize_n : r.end*elemsPerStripe;
rstart = std::min(rstart, planeSize_n);
rend = std::min(rend, planeSize_n);
float alpha1 = alpha1_, bias1 = bias1_, beta1 = beta1_;
int k, channels = channels_, ksize = ksize_;
for (int cn = 0; cn < std::min(ksize, channels); cn++)
cv::accumulateSquare(getPlane(srcMat, n, cn), accum);
AutoBuffer<float> buf_((channels + ksize*2 + 4)*2);
float* acc = (float*)buf_;
float* buf = acc + channels + ksize + 1;
for( k = 0; k <= ksize; k++ )
buf[-k-1] = buf[channels + k] = 0.f;
for (int cn = 0; cn < channels; cn++)
for( size_t ofs = rstart; ofs < rend; )
{
if (cn + ksize < channels)
{
cv::accumulateSquare(getPlane(srcMat, n, cn + ksize), accum);
}
int sampleIdx = (int)(ofs/planeSize);
if( sampleIdx >= nsamples )
break;
size_t ofs0 = ofs - sampleIdx*planeSize;
size_t ofs1 = std::min(planeSize - ofs0, rend - ofs) + ofs;
const float* src = src_ + sampleIdx*planeSize*channels + ofs0;
float* dst = dst_ + sampleIdx*planeSize*channels + ofs0;
if (cn - ksize - 1 >= 0)
for( ; ofs < ofs1; ofs++, src++, dst++ )
{
//subtractSquare
Mat left = getPlane(srcMat, n, cn - ksize - 1);
cv::pow(left, 2, left);
cv::subtract(accum, left, accum);
}
for( k = 0; k < channels; k++ )
buf[k] = src[k*planeSize];
float s = 0;
for( k = 0; k < ksize; k++ )
s += buf[k]*buf[k];
for( k = 0; k < channels; k++ )
{
float x1 = buf[k + ksize];
float x0 = buf[k - ksize - 1];
s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
acc[k] = (float)(alpha1*s + bias1);
}
Mat dst = getPlane(dstMat, n, cn);
accum.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
cv::pow(dst, beta, dst);
cv::divide(getPlane(srcMat, n, cn), dst, dst);
hal::log32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
acc[k] *= beta1;
hal::exp32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
dst[k*planeSize] = buf[k]*acc[k];
}
}
}
const float* src_;
float* dst_;
float alpha1_, bias1_, beta1_;
size_t planeSize_;
int channels_, ksize_, nsamples_, nstripes_;
};
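// For reference, with alpha1 = alpha/sizeNormFactor, bias1 = bias and beta1 = -beta
// (as passed by channelNormalization below), ChannelLRN computes for every spatial
// position and channel c:
//     dst[c] = src[c] * (bias + alpha/sizeNormFactor * sum_{|c'-c| <= ksize} src[c']^2)^(-beta)
// The sliding window sum s is updated incrementally as s += x1^2 - x0^2, and the
// power is evaluated for a whole row of channels at once via hal::log32f/hal::exp32f.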
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
size_t planeSize = srcBlob.size[2]*srcBlob.size[3];
int nstripes = std::max(getNumThreads(), 1);
ChannelLRN clrn(srcBlob.ptr<float>(), dstBlob.ptr<float>(), channels,
ksize, alpha/sizeNormFactor, bias, -beta, planeSize, num, nstripes);
parallel_for_(Range(0, nstripes), clrn, nstripes);
}
void sqrBoxFilter_(const Mat &src, Mat &dst)
......
......@@ -48,194 +48,6 @@
namespace cv {
namespace dnn {
#if 0
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
#endif
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
int kh = kernel_h, kw = kernel_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
int kelems = kh*kw;
AutoBuffer<int> ofs_(kelems);
int* ofs = ofs_;
int k = 0;
for( int k_r = 0; k_r < kernel_h; k_r++ )
for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
ofs[k] = k_r*dh*width + k_c*dw;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kh*kw*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
{
for( int i_c = 0; i_c < channels; i_c++ )
{
float* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
const float* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
for( k = 0; k < kelems; k++ )
{
data_col_c[k] = data_im_c[ofs[k]];
}
}
}
else
{
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kh*kw;
int in_r = start_in_r + start_k_r*dh;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kw;
int in_c = start_in_c + start_k_c*dw;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
}
};
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col)
{
im2row_CpuPBody<float>::run(data_im, channels, height, width,
kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
height_col, width_col, data_col);
}
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
......