Commit 645260af authored by Vadim Pisarevsky, committed by GitHub

optimized several conv net layers (#1227)

* rewrote the following layers to be [much] more efficient: convolution, fully connected, activations (ReLU, tanh, ...), and LRN. Optional AVX optimization is used for the first two.

* eliminated trailing whitespace
parent 009d2efb
@@ -201,9 +201,13 @@ namespace dnn
String padMode;
};
class CV_EXPORTS ActivationLayer;
class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
{
public:
virtual bool setActivation(const Ptr<ActivationLayer>& layer) = 0;
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
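The new setActivation hook lets callers attach the activation that follows a convolution, so the convolution can apply it inside its own parallel loop (via ActivationLayer::forwardSlice, declared below). A hedged usage sketch with placeholder LayerParams (convParams and reluParams are illustrative, not from this patch):

    LayerParams convParams, reluParams;               // placeholders, contents omitted
    Ptr<BaseConvolutionLayer> conv = ConvolutionLayer::create(convParams);
    Ptr<ActivationLayer> act = ReLULayer::create(reluParams);
    conv.dynamicCast<ConvolutionLayer>()->setActivation(act); // fuse the ReLU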
@@ -327,8 +331,14 @@ namespace dnn
};
/* Activations */
class CV_EXPORTS ActivationLayer : public Layer
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const = 0;
};
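forwardSlice processes channels [cn0, cn1) of one sample, len elements per channel, with consecutive channel planes outPlaneSize floats apart; this is what lets a fused convolution apply the activation stripe by stripe. A minimal ReLU-style sketch of that contract (the actual implementations live in elementwise_layers.cpp, outside this excerpt):

    #include <cstddef>

    static void reluForwardSliceSketch(const float* src, float* dst, int len,
                                       size_t outPlaneSize, int cn0, int cn1,
                                       float negativeSlope)
    {
        for( int cn = cn0; cn < cn1; cn++, src += outPlaneSize, dst += outPlaneSize )
            for( int i = 0; i < len; i++ )
            {
                float x = src[i];
                dst[i] = x >= 0.f ? x : negativeSlope*x;  // leaky ReLU per element
            }
    }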
class CV_EXPORTS ReLULayer : public Layer
class CV_EXPORTS ReLULayer : public ActivationLayer
{
public:
float negativeSlope;
@@ -336,37 +346,37 @@ namespace dnn
static Ptr<ReLULayer> create(const LayerParams &params);
};
class CV_EXPORTS ChannelsPReLULayer : public Layer
class CV_EXPORTS ChannelsPReLULayer : public ActivationLayer
{
public:
static Ptr<ChannelsPReLULayer> create(const LayerParams& params);
};
class CV_EXPORTS TanHLayer : public Layer
class CV_EXPORTS TanHLayer : public ActivationLayer
{
public:
static Ptr<TanHLayer> create(const LayerParams &params);
};
class CV_EXPORTS SigmoidLayer : public Layer
class CV_EXPORTS SigmoidLayer : public ActivationLayer
{
public:
static Ptr<SigmoidLayer> create(const LayerParams &params);
};
class CV_EXPORTS BNLLLayer : public Layer
class CV_EXPORTS BNLLLayer : public ActivationLayer
{
public:
static Ptr<BNLLLayer> create(const LayerParams &params);
};
class CV_EXPORTS AbsLayer : public Layer
class CV_EXPORTS AbsLayer : public ActivationLayer
{
public:
static Ptr<AbsLayer> create(const LayerParams &params);
};
class CV_EXPORTS PowerLayer : public Layer
class CV_EXPORTS PowerLayer : public ActivationLayer
{
public:
float power, scale, shift;
@@ -374,7 +384,7 @@ namespace dnn
static Ptr<PowerLayer> create(const LayerParams &params);
};
/* Layers using in semantic segmentation */
/* Layers used in semantic segmentation */
class CV_EXPORTS CropLayer : public Layer
{
......
@@ -52,6 +52,8 @@ namespace dnn
class FullyConnectedLayerImpl : public InnerProductLayer
{
public:
enum { VEC_ALIGN = 8 };
FullyConnectedLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
@@ -65,15 +67,29 @@ public:
CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));
blobs[0] = blobs[0].reshape(1, numOutput);
weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
int vecsize = weightsMat.cols;
if( vecsize % VEC_ALIGN != 0 )
{
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
wpadding.setTo(Scalar::all(0.));
weightsMat = weightsBuf.colRange(0, vecsize);
blobs[0].copyTo(weightsMat);
blobs[0] = weightsMat;
}
if (bias)
blobs[1] = blobs[1].reshape(1, 1);
biasMat = blobs[1] = blobs[1].reshape(1, 1);
else
biasMat = Mat::zeros(1, numOutput, weightsMat.type());
}
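Each weight row is padded with zeros up to a multiple of VEC_ALIGN floats, so the SIMD dot-product loops below can always read full 4- or 8-float groups without a scalar tail. A standalone sketch of the trick, assuming only cv::alignSize and cv::Mat:

    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        const int VEC_ALIGN = 8;
        cv::Mat weights(10, 61, CV_32F, cv::Scalar(1));
        int vecsize_aligned = (int)cv::alignSize(weights.cols, VEC_ALIGN); // 64
        cv::Mat buf(weights.rows, vecsize_aligned, weights.type(), cv::Scalar(0));
        cv::Mat padded = buf.colRange(0, weights.cols);  // logical 10x61 view
        weights.copyTo(padded);                          // trailing 3 columns stay 0
        std::cout << padded.step1() << std::endl;        // 64: rows are 8 floats apart
        return 0;
    }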
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const
std::vector<MatShape> &) const
{
CV_Assert(inputs.size() > 0);
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
@@ -84,36 +100,116 @@ public:
int numOutput = blobs[0].size[0];
outputs.resize(inputs.size(), shape(outerSize, numOutput));
internals.push_back(shape(outerSize, 1));
CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
return false;
}
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
class FullConnected : public ParallelLoopBody
{
internals[0].setTo(1.);
const Mat &weight = blobs[0];
const Mat *biasMat = NULL, *biasOnesMat = NULL;
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
public:
FullConnected(const Mat& srcMat, const Mat& weights, const Mat& biasMat, Mat& dstMat, int nstripes)
{
CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
srcMat.type() == CV_32F &&
(biasMat.empty() || (biasMat.type() == srcMat.type() &&
biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );
srcMat_ = &srcMat;
weights_ = &weights;
biasMat_ = &biasMat;
dstMat_ = &dstMat;
nstripes_ = nstripes;
useAVX2_ = checkHardwareSupport(CPU_AVX2);
}
if (bias)
void operator()(const Range& r) const
{
biasOnesMat = &internals[0];
biasMat = &blobs[1];
int nsamples = srcMat_->rows;
int nw0 = weights_->rows;
int vecsize = srcMat_->cols;
int nstripes = nstripes_;
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights_->step1();
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr = srcMat_->ptr<float>(sampleIdx);
const float* wptr = weights_->ptr<float>(delta);
float* dptr = dstMat_->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat_->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
#if CV_DNN_TRY_AVX2
if( useAVX2_ )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0, k;
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
for( k = 0; k < vecsize; k += 4 )
{
v_float32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
vs3 += v*v_load_aligned(wptr + wstep*3 + k);
}
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
s += v_load(biasptr + i);
v_store(dptr + i, s);
}
#endif
for( ; i < nw; i++, wptr += wstep )
{
float s0=biasptr[i];
for( k = 0; k < vecsize; k++ )
{
float v = sptr[k];
s0 += v*wptr[k];
}
dptr[i] = s0;
}
}
ofs += nw;
}
}
const Mat *srcMat_, *weights_, *biasMat_;
Mat* dstMat_;
int nstripes_;
bool useAVX2_;
};
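The parallel body above partitions the flat output index space (nsamples rows times nw0 outputs) into nstripes nearly equal ranges and decodes each flat offset back into a (sample row, first output column) pair. The arithmetic, isolated as a runnable sketch:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        int nsamples = 3, nw0 = 10, nstripes = 4;
        size_t total = (size_t)nsamples*nw0;                 // 30 output elements
        size_t stripeSize = (total + nstripes - 1)/nstripes; // 8 per stripe
        for( int s = 0; s < nstripes; s++ )
        {
            size_t start = s*stripeSize;
            size_t end = std::min((s + 1)*stripeSize, total);
            int sampleIdx = (int)(start / nw0);               // input row
            int delta = (int)(start - (size_t)sampleIdx*nw0); // first output in it
            printf("stripe %d: [%d, %d) -> sample %d, output %d\n",
                   s, (int)start, (int)end, sampleIdx, delta);
        }
        return 0;
    }

A stripe that straddles a row boundary is handled by the nw = std::min(nw0 - delta, ...) clamp in operator(), which finishes the current row and then re-decodes the offset.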
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &)
{
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
for (size_t i = 0; i < input.size(); i++)
{
Mat srcMat = input[i]->reshape(1, outerSize);
Mat dstMat = output[i].reshape(1, outerSize);
dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T);
if (bias)
dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1);
const int nstripes = getNumThreads();
FullConnected fconn(srcMat, weightsMat, biasMat, dstMat, nstripes);
parallel_for_(Range(0, nstripes), fconn, nstripes);
}
}
@@ -134,6 +230,7 @@ public:
}
bool bias;
Mat weightsMat, biasMat;
};
Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#if CV_DNN_TRY_AVX2
#include <immintrin.h>
namespace cv {
namespace dnn {
void fastConv_avx2( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, bool initOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
// now compute dot product of the weights
// and im2row-transformed part of the tensor
for( int i = 0; i < outCn; i += 3 )
{
const float* wptr0 = weights + i*wstep;
const float* wptr1 = wptr0 + wstep;
const float* wptr2 = wptr1 + wstep;
float* outptr0 = output + i*outPlaneSize;
float* outptr1 = outptr0 + outPlaneSize;
float* outptr2 = outptr1 + outPlaneSize;
float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
}
}
int j = 0;
for( ; j <= blockSize - 4; j += 4 )
{
const float* rptr = rowbuf + j*vecsize_aligned;
__m256 vs00 = _mm256_setzero_ps(), vs01 = _mm256_setzero_ps(),
vs02 = _mm256_setzero_ps(), vs03 = _mm256_setzero_ps(),
vs10 = _mm256_setzero_ps(), vs11 = _mm256_setzero_ps(),
vs12 = _mm256_setzero_ps(), vs13 = _mm256_setzero_ps(),
vs20 = _mm256_setzero_ps(), vs21 = _mm256_setzero_ps(),
vs22 = _mm256_setzero_ps(), vs23 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, rptr += 8 )
{
__m256 w0 = _mm256_load_ps(wptr0 + k);
__m256 w1 = _mm256_load_ps(wptr1 + k);
__m256 w2 = _mm256_load_ps(wptr2 + k);
__m256 r0 = _mm256_load_ps(rptr);
vs00 = _mm256_fmadd_ps(w0, r0, vs00);
vs10 = _mm256_fmadd_ps(w1, r0, vs10);
vs20 = _mm256_fmadd_ps(w2, r0, vs20);
r0 = _mm256_load_ps(rptr + vecsize_aligned);
vs01 = _mm256_fmadd_ps(w0, r0, vs01);
vs11 = _mm256_fmadd_ps(w1, r0, vs11);
vs21 = _mm256_fmadd_ps(w2, r0, vs21);
r0 = _mm256_load_ps(rptr + vecsize_aligned*2);
vs02 = _mm256_fmadd_ps(w0, r0, vs02);
vs12 = _mm256_fmadd_ps(w1, r0, vs12);
vs22 = _mm256_fmadd_ps(w2, r0, vs22);
r0 = _mm256_load_ps(rptr + vecsize_aligned*3);
vs03 = _mm256_fmadd_ps(w0, r0, vs03);
vs13 = _mm256_fmadd_ps(w1, r0, vs13);
vs23 = _mm256_fmadd_ps(w2, r0, vs23);
}
__m256 t0 = _mm256_hadd_ps(_mm256_hadd_ps(vs00, vs01), _mm256_hadd_ps(vs02, vs03));
__m256 t1 = _mm256_hadd_ps(_mm256_hadd_ps(vs10, vs11), _mm256_hadd_ps(vs12, vs13));
__m256 t2 = _mm256_hadd_ps(_mm256_hadd_ps(vs20, vs21), _mm256_hadd_ps(vs22, vs23));
t0 = _mm256_add_ps(t0, _mm256_permute2f128_ps(t0, t0, 1));
t1 = _mm256_add_ps(t1, _mm256_permute2f128_ps(t1, t1, 1));
t2 = _mm256_add_ps(t2, _mm256_permute2f128_ps(t2, t2, 1));
__m256 s0, s1, s2;
if( initOutput )
{
s0 = _mm256_set1_ps(bias0);
s1 = _mm256_set1_ps(bias1);
s2 = _mm256_set1_ps(bias2);
}
else
{
s0 = _mm256_castps128_ps256(_mm_loadu_ps(outptr0 + j));
s1 = _mm256_castps128_ps256(_mm_loadu_ps(outptr1 + j));
s2 = _mm256_castps128_ps256(_mm_loadu_ps(outptr2 + j));
}
s0 = _mm256_add_ps(s0, t0);
s1 = _mm256_add_ps(s1, t1);
s2 = _mm256_add_ps(s2, t2);
_mm_storeu_ps(outptr0 + j, _mm256_castps256_ps128(s0));
_mm_storeu_ps(outptr1 + j, _mm256_castps256_ps128(s1));
_mm_storeu_ps(outptr2 + j, _mm256_castps256_ps128(s2));
}
for( ; j < blockSize; j++ )
{
const float* rptr = rowbuf + j*vecsize_aligned;
float s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
float r0 = rptr[k];
s00 += wptr0[k]*r0;
s10 += wptr1[k]*r0;
s20 += wptr2[k]*r0;
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
_mm256_zeroupper();
}
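The kernel processes three output channels and four output positions per iteration, keeping twelve 8-wide accumulators live. Each _mm256_hadd_ps/_mm256_hadd_ps/_mm256_permute2f128_ps sequence then collapses four accumulators into four packed dot-product sums in the low 128 bits. What that reduction computes for a single accumulator, as a scalar-equivalent helper (for reference only, not part of the patch):

    #include <immintrin.h>

    static float hsum256(__m256 v)
    {
        __m128 lo = _mm256_castps256_ps128(v);
        __m128 hi = _mm256_extractf128_ps(v, 1);
        __m128 s  = _mm_add_ps(lo, hi);   // fold 256 bits to 128
        s = _mm_hadd_ps(s, s);            // fold 4 floats to 2
        s = _mm_hadd_ps(s, s);            // fold 2 floats to 1
        return _mm_cvtss_f32(s);
    }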
// dst = vec * weights^t + bias
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
int i = 0;
for( ; i <= nvecs - 8; i += 8 )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = _mm256_setzero_ps(), vs1 = _mm256_setzero_ps(),
vs2 = _mm256_setzero_ps(), vs3 = _mm256_setzero_ps(),
vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(),
vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2);
vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3);
vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4);
vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5);
vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6);
vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7);
}
__m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3));
__m256 s1 = _mm256_hadd_ps(_mm256_hadd_ps(vs4, vs5), _mm256_hadd_ps(vs6, vs7));
s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
s1 = _mm256_add_ps(s1, _mm256_permute2f128_ps(s1, s1, 1));
s0 = _mm256_add_ps(s0, _mm256_castps128_ps256(_mm_loadu_ps(bias + i)));
s1 = _mm256_add_ps(s1, _mm256_castps128_ps256(_mm_loadu_ps(bias + i + 4)));
_mm_storeu_ps(dst + i, _mm256_castps256_ps128(s0));
_mm_storeu_ps(dst + i + 4, _mm256_castps256_ps128(s1));
}
float temp = 0.f;
for( ; i < nvecs; i++ )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
}
__m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0);
s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
_mm_store_ss(&temp, _mm256_castps256_ps128(s0));
dst[i] = temp + bias[i];
}
_mm256_zeroupper();
}
}
}
#endif
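A plain scalar version of the contract documented above (dst = vec * weights^T + bias) is handy for validating the AVX2 path. This reference sketch is added for illustration and is not part of the patch:

    // weights: nvecs rows of vecsize floats, row stride wstep (in floats)
    static void fastGEMM1T_ref( const float* vec, const float* weights,
                                size_t wstep, const float* bias,
                                float* dst, int nvecs, int vecsize )
    {
        for( int i = 0; i < nvecs; i++ )
        {
            const float* wptr = weights + i*wstep;
            float s = bias[i];
            for( int k = 0; k < vecsize; k++ )
                s += vec[k]*wptr[k];
            dst[i] = s;
        }
    }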
@@ -64,6 +64,21 @@ void getConvPoolOutParams(const Size& inp, const Size &kernel,
void getConvPoolPaddings(const Size& inp, const Size& out,
const Size &kernel, const Size &stride,
const String &padMode, Size &pad);
#if CV_SSE2
#define CV_DNN_TRY_AVX2 1
void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, bool initOutput);
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
#else
#define CV_DNN_TRY_AVX2 0
#endif
}
}
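The guard is two-level: CV_DNN_TRY_AVX2 only says the toolchain targets x86 and the AVX2 translation unit may be compiled, while every call site still checks checkHardwareSupport(CPU_AVX2) at runtime, as FullConnected::operator() does above. A hypothetical wrapper showing the combined pattern (fastGEMM1T_ref is the reference sketch from earlier, not a patch symbol; the patch inlines this check at call sites instead):

    static void fastGEMM1T( const float* vec, const float* weights, size_t wstep,
                            const float* bias, float* dst, int nvecs, int vecsize )
    {
    #if CV_DNN_TRY_AVX2
        if( checkHardwareSupport(CPU_AVX2) )      // runtime CPU feature test
        {
            fastGEMM1T_avx2(vec, weights, wstep, bias, dst, nvecs, vecsize);
            return;
        }
    #endif
        fastGEMM1T_ref(vec, weights, wstep, bias, dst, nvecs, vecsize); // fallback
    }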
......
@@ -41,8 +41,9 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include "opencv2/imgproc.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/hal/hal.hpp"
#include <algorithm>
namespace cv
@@ -100,45 +101,94 @@ public:
}
}
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
class ChannelLRN : public ParallelLoopBody
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
Mat srcMat = srcBlob.clone();
Mat dstMat = dstBlob;
public:
ChannelLRN(const float* src, float* dst, int channels, int ksize,
float alpha1, float bias1, float beta1,
size_t planeSize, int nsamples, int nstripes)
{
src_ = src; dst_ = dst;
channels_ = channels;
ksize_ = ksize;
alpha1_ = alpha1; bias1_ = bias1; beta1_ = beta1;
planeSize_ = planeSize; nsamples_ = nsamples; nstripes_ = nstripes;
}
for (int n = 0; n < num; n++)
void operator()(const Range& r) const
{
Mat accum = getPlane(dstMat, n, channels-1); //trick for memory saving
accum.setTo(0);
int nsamples = nsamples_, nstripes = nstripes_;
size_t planeSize = planeSize_, planeSize_n = planeSize * nsamples;
size_t elemsPerStripe = (planeSize_n + nstripes - 1)/nstripes;
size_t rstart = r.start*elemsPerStripe;
size_t rend = r.end == nstripes ? planeSize_n : r.end*elemsPerStripe;
rstart = std::min(rstart, planeSize_n);
rend = std::min(rend, planeSize_n);
float alpha1 = alpha1_, bias1 = bias1_, beta1 = beta1_;
int k, channels = channels_, ksize = ksize_;
for (int cn = 0; cn < std::min(ksize, channels); cn++)
cv::accumulateSquare(getPlane(srcMat, n, cn), accum);
AutoBuffer<float> buf_((channels + ksize*2 + 4)*2);
float* acc = (float*)buf_;
float* buf = acc + channels + ksize + 1;
for( k = 0; k <= ksize; k++ )
buf[-k-1] = buf[channels + k] = 0.f;
for (int cn = 0; cn < channels; cn++)
for( size_t ofs = rstart; ofs < rend; )
{
if (cn + ksize < channels)
{
cv::accumulateSquare(getPlane(srcMat, n, cn + ksize), accum);
}
int sampleIdx = (int)(ofs/planeSize);
if( sampleIdx >= nsamples )
break;
size_t ofs0 = ofs - sampleIdx*planeSize;
size_t ofs1 = std::min(planeSize - ofs0, rend - ofs) + ofs;
const float* src = src_ + sampleIdx*planeSize*channels + ofs0;
float* dst = dst_ + sampleIdx*planeSize*channels + ofs0;
if (cn - ksize - 1 >= 0)
for( ; ofs < ofs1; ofs++, src++, dst++ )
{
//subtractSquare
Mat left = getPlane(srcMat, n, cn - ksize - 1);
cv::pow(left, 2, left);
cv::subtract(accum, left, accum);
}
for( k = 0; k < channels; k++ )
buf[k] = src[k*planeSize];
float s = 0;
for( k = 0; k < ksize; k++ )
s += buf[k]*buf[k];
for( k = 0; k < channels; k++ )
{
float x1 = buf[k + ksize];
float x0 = buf[k - ksize - 1];
s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
acc[k] = (float)(alpha1*s + bias1);
}
Mat dst = getPlane(dstMat, n, cn);
accum.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
cv::pow(dst, beta, dst);
cv::divide(getPlane(srcMat, n, cn), dst, dst);
hal::log32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
acc[k] *= beta1;
hal::exp32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
dst[k*planeSize] = buf[k]*acc[k];
}
}
}
const float* src_;
float* dst_;
float alpha1_, bias1_, beta1_;
size_t planeSize_;
int channels_, ksize_, nsamples_, nstripes_;
};
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
size_t planeSize = srcBlob.size[2]*srcBlob.size[3];
int nstripes = std::max(getNumThreads(), 1);
ChannelLRN clrn(srcBlob.ptr<float>(), dstBlob.ptr<float>(), channels,
ksize, alpha/sizeNormFactor, bias, -beta, planeSize, num, nstripes);
parallel_for_(Range(0, nstripes), clrn, nstripes);
}
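For each pixel the loop gathers its values across channels into buf, then maintains a running sum of squares over a sliding window of 2*ksize+1 channels: stepping from channel k-1 to k adds the entering square and removes the leaving one via (x1 + x0)*(x1 - x0) = x1^2 - x0^2, clamped at zero against rounding drift. The final x*(alpha1*s + bias1)^(-beta) is evaluated as exp(-beta*log(...)) using the vectorized hal::log32f/hal::exp32f. A scalar sketch of one pixel, assuming buf points into a buffer zero-padded by ksize+1 entries on each side, as the AutoBuffer setup above arranges:

    #include <algorithm>
    #include <cmath>

    // alpha1 = alpha/sizeNormFactor, bias1 = bias, beta1 = -beta (as in the ctor call)
    static void lrnOnePixelSketch(const float* buf, float* dst, int channels,
                                  int ksize, float alpha1, float bias1, float beta1)
    {
        float s = 0.f;
        for( int k = 0; k < ksize; k++ )   // squares of buf[0..ksize-1]; the loop's
            s += buf[k]*buf[k];            // first step completes channel 0's window
        for( int k = 0; k < channels; k++ )
        {
            float x1 = buf[k + ksize];     // value entering the window
            float x0 = buf[k - ksize - 1]; // value leaving the window
            s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
            // the real code batches the pow via hal::log32f/exp32f over all channels
            dst[k] = buf[k]*std::exp(beta1*std::log(alpha1*s + bias1));
        }
    }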
void sqrBoxFilter_(const Mat &src, Mat &dst)
......
@@ -48,194 +48,6 @@
namespace cv {
namespace dnn {
#if 0
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
#endif
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
int kh = kernel_h, kw = kernel_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
int kelems = kh*kw;
AutoBuffer<int> ofs_(kelems);
int* ofs = ofs_;
int k = 0;
for( int k_r = 0; k_r < kernel_h; k_r++ )
for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
ofs[k] = k_r*dh*width + k_c*dw;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kh*kw*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
{
for( int i_c = 0; i_c < channels; i_c++ )
{
float* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
const float* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
for( k = 0; k < kelems; k++ )
{
data_col_c[k] = data_im_c[ofs[k]];
}
}
}
else
{
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kh*kw;
int in_r = start_in_r + start_k_r*dh;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kw;
int in_c = start_in_c + start_k_c*dw;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
}
};
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col)
{
im2row_CpuPBody<float>::run(data_im, channels, height, width,
kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
height_col, width_col, data_col);
}
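im2row writes, for each of the height_col*width_col output positions, one row containing its full receptive field of channels*kernel_h*kernel_w input samples (border taps fall outside the image and contribute zeros), so the convolution becomes a plain matrix product of the weight matrix against this buffer. The buffer geometry, as a small sketch using the standard output-size formula:

    #include <cstdio>

    int main()
    {
        int height = 32, width = 32, channels = 16;
        int kernel_h = 3, kernel_w = 3, pad_h = 1, pad_w = 1;
        int stride_h = 1, stride_w = 1, dilation_h = 1, dilation_w = 1;
        int height_col = (height + 2*pad_h - dilation_h*(kernel_h - 1) - 1)/stride_h + 1;
        int width_col  = (width  + 2*pad_w - dilation_w*(kernel_w - 1) - 1)/stride_w + 1;
        printf("rows N = %d, row length K = %d\n",
               height_col*width_col, channels*kernel_h*kernel_w); // N = 1024, K = 144
        return 0;
    }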
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
......