Commit b593cae0 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky Committed by GitHub

some further optimizations and cleanups in dnn (#1237)

* some further optimizations and cleanups in dnn:
+ got rid of dnn::gemm; it's not perf critical anymore (perhaps)
+ embedded col2im functionality into convolution_layer.cpp, since it's not used anywhere else
+ parallel max pooling. even better performance can be achieved if we knew that max indices are not needed (and they are not needed in most networks)
+ somewhat optimized deconvolution layer: optimized bias addition (merged it with col2im), optimized col2im slightly.
+ hopefully fixed incorrect memory access in fully-connected layer; restored aligned memory reads (they should work fine now)

* hopefully fixed regressions in ENet performance

* fixed some typos in deconvolution; added SIMD optimization for the max pooling layer

* fixed warnings in SIMD-less build configuration
parent 0b4fc061
......@@ -41,7 +41,6 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_blas.hpp"
#include "op_halide.hpp"
#include <opencv2/dnn/shape_utils.hpp>
......@@ -133,33 +132,42 @@ public:
void operator()(const Range& r) const
{
int valign = FullyConnectedLayerImpl::VEC_ALIGN;
int nsamples = srcMat_->rows;
int nw0 = weights_->rows;
int vecsize = srcMat_->cols;
int k, vecsize = srcMat_->cols;
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
int nstripes = nstripes_;
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights_->step1();
AutoBuffer<float> srcbuf(vecsize_aligned + valign);
float* sptr = alignPtr((float*)srcbuf, (int)(valign*sizeof(float)));
for( k = vecsize; k < vecsize_aligned; k++ )
sptr[k] = 0.f;
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr = srcMat_->ptr<float>(sampleIdx);
const float* sptr_ = srcMat_->ptr<float>(sampleIdx);
const float* wptr = weights_->ptr<float>(delta);
float* dptr = dstMat_->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat_->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
#if CV_DNN_TRY_AVX2
if( useAVX2_ )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0, k;
int i = 0;
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
......@@ -169,7 +177,7 @@ public:
for( k = 0; k < vecsize; k += 4 )
{
vfloat32x4 v = v_load(sptr + k);
vfloat32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
......
......@@ -204,7 +204,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_loadu_ps(vec + k);
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
......@@ -237,7 +237,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_loadu_ps(vec + k);
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
}
......@@ -250,6 +250,76 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
_mm256_zeroupper();
}
// Plain single-precision GEMM for AVX2+FMA: C = A*B (no alpha/beta scaling,
// C is overwritten). A is ma x na with row stride astep, B is na x nb with
// row stride bstep, C is ma x nb with row stride cstep; strides are in
// elements, not bytes.
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr,
                    size_t bstep, float* cptr, size_t cstep,
                    int ma, int na, int nb )
{
    int n = 0;

    // Vectorized part: accumulate a 4x16 block of C (4 rows x two 8-float
    // AVX registers) over the whole shared dimension na.
    for( ; n <= nb - 16; n += 16 )
    {
        for( int m = 0; m < ma; m += 4 )
        {
            // When ma is not a multiple of 4, the trailing row indices are
            // clamped to ma-1; the duplicated rows then compute and store
            // identical values — redundant work, but never out of bounds.
            const float* aptr0 = aptr + astep*m;
            const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
            const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
            const float* aptr3 = aptr + astep*std::min(m+3, ma-1);

            float* cptr0 = cptr + cstep*m;
            float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
            float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
            float* cptr3 = cptr + cstep*std::min(m+3, ma-1);

            __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
            __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
            __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
            __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();

            for( int k = 0; k < na; k++ )
            {
                // Broadcast one A element per row ...
                __m256 a0 = _mm256_set1_ps(aptr0[k]);
                __m256 a1 = _mm256_set1_ps(aptr1[k]);
                __m256 a2 = _mm256_set1_ps(aptr2[k]);
                __m256 a3 = _mm256_set1_ps(aptr3[k]);

                // ... and multiply-accumulate with 16 consecutive elements of
                // the k-th row of B. Unaligned loads: B rows are not
                // guaranteed to be 32-byte aligned.
                __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n);
                __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8);

                d00 = _mm256_fmadd_ps(a0, b0, d00);
                d01 = _mm256_fmadd_ps(a0, b1, d01);
                d10 = _mm256_fmadd_ps(a1, b0, d10);
                d11 = _mm256_fmadd_ps(a1, b1, d11);
                d20 = _mm256_fmadd_ps(a2, b0, d20);
                d21 = _mm256_fmadd_ps(a2, b1, d21);
                d30 = _mm256_fmadd_ps(a3, b0, d30);
                d31 = _mm256_fmadd_ps(a3, b1, d31);
            }

            _mm256_storeu_ps(cptr0 + n, d00);
            _mm256_storeu_ps(cptr0 + n + 8, d01);
            _mm256_storeu_ps(cptr1 + n, d10);
            _mm256_storeu_ps(cptr1 + n + 8, d11);
            _mm256_storeu_ps(cptr2 + n, d20);
            _mm256_storeu_ps(cptr2 + n + 8, d21);
            _mm256_storeu_ps(cptr3 + n, d30);
            _mm256_storeu_ps(cptr3 + n + 8, d31);
        }
    }

    // Leave AVX state before the scalar tail to avoid SSE/AVX transition
    // penalties.
    _mm256_zeroupper();

    // Scalar tail: remaining (< 16) columns of C, one element at a time.
    for( ; n < nb; n++ )
    {
        for( int m = 0; m < ma; m++ )
        {
            const float* aptr0 = aptr + astep*m;
            float* cptr0 = cptr + cstep*m;
            float d0 = 0.f;

            for( int k = 0; k < na; k++ )
                d0 += aptr0[k]*bptr[k*bstep + n];

            cptr0[n] = d0;
        }
    }
}
}
}
......
......@@ -42,8 +42,6 @@
#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp>
#include "op_blas.hpp"
#include "op_im2col.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
......@@ -74,6 +72,9 @@ void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#else
#define CV_DNN_TRY_AVX2 0
......
......@@ -41,7 +41,6 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_blas.hpp"
#include <float.h>
#include <algorithm>
......@@ -182,14 +181,14 @@ public:
Mat norm(channelSize, 1, buffer.type()); // 1 x channelSize
// (_channels x channelSize)T * _channels x 1 -> channelSize x 1
gemmCPU(buffer, sumChannelMultiplier, 1, norm, 0, GEMM_1_T);
gemm(buffer, sumChannelMultiplier, 1, norm, 0, norm, GEMM_1_T);
// compute norm
pow(norm, 0.5f, norm);
// scale the layer
// _channels x 1 * (channelSize x 1)T -> _channels x channelSize
gemmCPU(sumChannelMultiplier, norm, 1, buffer, 0, GEMM_2_T);
gemm(sumChannelMultiplier, norm, 1, buffer, 0, buffer, GEMM_2_T);
dst = src / buffer;
}
......@@ -204,7 +203,7 @@ public:
{
// _scale: _channels x 1
// _channels x 1 * 1 x channelSize -> _channels x channelSize
gemmCPU(scale, sumSpatialMultiplier, 1, buffer, 0);
gemm(scale, sumSpatialMultiplier, 1, buffer, 0, buffer);
dst = dst.mul(buffer);
}
......
#include "op_blas.hpp"
#ifdef HAVE_LAPACK
#include "opencv_lapack.h"
#endif
#include <iostream>
namespace cv
{
namespace dnn
{
// Dispatching gemm for the dnn module: C = alpha*op(A)*op(B) + beta*C.
// When C is a plain Mat the dedicated CPU path (gemmCPU) is used; otherwise
// the computation falls back to the generic cv::gemm.
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags)
{
    if (!C.isMat())
    {
        // When beta == 0 the prior contents of C are irrelevant, so pass
        // noArray() as src3 to cv::gemm.
        cv::gemm(A, B, alpha, (beta == 0) ? noArray() : C, beta, C, flags);
        return;
    }
    gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
}
// Reports the logical (post-transposition) dimensions of a 2D matrix:
// if isTrans is set, rows and cols are swapped relative to A's layout.
inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
{
    CV_DbgAssert(A.dims == 2);
    if (isTrans)
    {
        rows = A.cols;
        cols = A.rows;
    }
    else
    {
        rows = A.rows;
        cols = A.cols;
    }
}
class GEMMInvoker : public ParallelLoopBody
{
public:
GEMMInvoker(const Mat* _a, const Mat* _b, double _alpha, Mat* _c, double _beta)
{
a = _a;
b = _b;
c = _c;
alpha = _alpha;
beta = _beta;
}
void operator()(const Range& range) const
{
int mmax = a->rows;
int nmax = range.end - range.start;
int kmax = a->cols;
int m, n, k;
AutoBuffer<float> buf(nmax);
float* ptr = buf;
if( mmax %2 != 0 )
memset(ptr, 0, nmax*sizeof(ptr[0]));
for( m = 0; m < mmax; m += 2 )
{
float* dst0 = c->ptr<float>(m) + range.start;
float* dst1 = m+1 < mmax ? c->ptr<float>(m+1) + range.start : ptr;
const float* aptr0 = a->ptr<float>(m);
const float* aptr1 = m+1 < mmax ? a->ptr<float>(m+1) : aptr0;
if( beta != 1 )
{
if( beta == 0 )
for( n = 0; n < nmax; n++ )
{
dst0[n] = 0.f;
dst1[n] = 0.f;
}
else
for( n = 0; n < nmax; n++ )
{
dst0[n] *= (float)beta;
dst1[n] *= (float)beta;
}
}
for( k = 0; k < kmax; k++ )
{
float alpha0 = (float)(alpha*aptr0[k]);
float alpha1 = (float)(alpha*aptr1[k]);
const float* bptr = b->ptr<float>(k) + range.start;
for( n = 0; n < nmax; n++ )
{
float d0 = dst0[n] + alpha0*bptr[n];
float d1 = dst1[n] + alpha1*bptr[n];
dst0[n] = d0;
dst1[n] = d1;
}
}
}
}
const Mat *a, *b;
Mat* c;
double alpha, beta;
};
// CPU GEMM: C = alpha*op(A)*op(B) + beta*C, where op() is an optional
// transposition selected via GEMM_1_T / GEMM_2_T in 'flags'.
// With LAPACK the call goes to cblas; otherwise a simple parallel fallback
// is used for the common (CV_32F, flags == 0) case and cv::gemm for the rest.
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags /*= 0*/)
{
#ifdef HAVE_LAPACK
    bool transA = static_cast<bool>(flags & GEMM_1_T);
    bool transB = static_cast<bool>(flags & GEMM_2_T);
    bool transC = static_cast<bool>(flags & GEMM_3_T);

    // Logical (post-transposition) dimensions of each operand.
    int Arows, Acols, Brows, Bcols, Crows, Ccols;
    SwapRowCols(A, Arows, Acols, transA);
    SwapRowCols(B, Brows, Bcols, transB);
    SwapRowCols(C, Crows, Ccols, transC);

    // Transposed output is not supported; operands must be shape-compatible,
    // continuous, of the same type, and C must not alias A or B.
    CV_Assert(!(flags & GEMM_3_T));
    CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
    CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
    CV_Assert(A.type() == B.type() && B.type() == C.type());
    CV_Assert(A.data != C.data && B.data != C.data);

    if (C.type() == CV_32F)
    {
        // Row-major cblas call; leading dimensions are the physical row
        // lengths (the untransposed .cols of each matrix).
        cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
                    Arows, Bcols, Acols,
                    (float)alpha, A.ptr<float>(), A.cols,
                    B.ptr<float>(), B.cols,
                    (float)beta, C.ptr<float>(), C.cols);
    }
    else if (C.type() == CV_64F)
    {
        //TODO: Should be tested
        cblas_dgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
                    Arows, Bcols, Acols,
                    alpha, A.ptr<double>(), A.cols,
                    B.ptr<double>(), B.cols,
                    beta, C.ptr<double>(), C.cols);
    }
    else
    {
        CV_Error(Error::BadDepth, "Only floating point types are supported");
    }
#else
    if( C.type() == CV_32F && flags == 0 )
    {
        // Hand-rolled parallel fallback; granularity keeps per-stripe work
        // roughly constant regardless of the size of A.
        GEMMInvoker invoker(&A, &B, alpha, &C, beta);
        double granularity = 10000000./((double)A.rows*A.cols);
        parallel_for_(Range(0, B.cols), invoker, granularity);
    }
    else
        cv::gemm(A, B, alpha, C, beta, C, flags);
#endif
}
// Returns the number of worker threads used by the BLAS backend.
// Only meaningful when built against OpenBLAS; otherwise reports 1.
int getBlasThreads()
{
#ifdef OPENBLAS_VERSION
    return openblas_get_num_threads();
#else
    return 1;
#endif
}
// Sets the number of worker threads for the BLAS backend; a no-op when not
// built against OpenBLAS. Both the openblas_ and goto_ entry points are
// updated (the latter presumably for GotoBLAS-compatible builds — confirm).
void setBlasThreads(int numThreads)
{
#ifdef OPENBLAS_VERSION
    openblas_set_num_threads(numThreads);
    goto_set_num_threads(numThreads);
#else
    (void)numThreads; //suppress compilers' warning
#endif
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#define __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#include "../precomp.hpp"
namespace cv
{
namespace dnn
{
// Number of threads currently used by the BLAS backend (1 without OpenBLAS).
int getBlasThreads();
// Set the BLAS backend's thread count (no-op without OpenBLAS).
void setBlasThreads(int numThreads);
// C = alpha*op(A)*op(B) + beta*C; dispatches to gemmCPU for Mat outputs and
// to cv::gemm otherwise. 'flags' accepts GEMM_1_T / GEMM_2_T.
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags = 0);
// CPU-only variant operating directly on Mat; requires continuous data of
// matching floating-point type, no GEMM_3_T, and C not aliasing A or B.
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags = 0);
}
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include <opencv2/core/ocl.hpp>
#include "opencl_kernels_dnn.hpp"
#include "op_im2col.hpp"
#include "opencl_kernels_dnn.hpp"
namespace cv {
namespace dnn {
// Parallel col2im: scatter-adds a column buffer (the output of im2col) back
// into an image. The work is parallelized over the pixels of the destination
// image: each work item computes one data_im element by summing every
// data_col entry whose kernel window covers that pixel.
// NOTE: this implementation does not handle dilation (the col2im() wrapper
// below discards its dilation arguments before calling run()).
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
    const Dtype* data_col;       // input column buffer, (channels*kernel_h*kernel_w) rows
    int channels, height, width; // destination image geometry
    int kernel_h, kernel_w;
    int pad_h, pad_w;
    int stride_h, stride_w;
    Dtype* data_im;              // output image, channels x height x width
    int height_col, width_col;   // spatial grid size of the column buffer

    col2im_CpuPBody() {}         // instances are created only through run()

public:
    // Fills in the parameters, derives the column-grid size and runs the
    // body in parallel over all channels*height*width destination elements.
    static void run(const Dtype* data_col,
                    int channels, int height, int width,
                    int kernel_h, int kernel_w,
                    int pad_h, int pad_w,
                    int stride_h, int stride_w,
                    Dtype* data_im)
    {
        //TODO: single-threaded version switch
        col2im_CpuPBody t;
        t.data_col = data_col;
        t.data_im = data_im;
        t.channels = channels; t.height = height; t.width = width;
        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
        t.pad_h = pad_h; t.pad_w = pad_w;
        t.stride_h = stride_h; t.stride_w = stride_w;
        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;

        int img_total = channels * height * width;
        cv::parallel_for_(Range(0, img_total), t);
    }

    virtual void operator ()(const Range &r) const
    {
        const Dtype* data_col_ = data_col;
        Dtype* data_im_ = data_im;
        // Linear coefficients that fold the per-(h_col, w_col) update of the
        // column-buffer index into simple increments (the standard Caffe
        // col2im index transformation).
        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
        int coeff_w_col = (1 - stride_w * height_col * width_col);
        for (int index = r.start; index < r.end; index++)
        {
            Dtype val = 0;
            // Decompose the flat destination index into (c, h, w); h and w
            // are shifted by the padding so they address padded coordinates.
            int w = index % width + pad_w;
            int h = (index / width) % height + pad_h;
            int c = index / (width * height);

            // compute the start and end of the output:
            // the range of column-grid positions whose kernel covers (h, w).
            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
            int w_col_end = std::min(w / stride_w + 1, width_col);
            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
            int h_col_end = std::min(h / stride_h + 1, height_col);

            // equivalent implementation
            int offset =
                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
                }
            }
            data_im_[index] = val;
        }
    }
};
// Single-threaded col2im (kept as a reference implementation): scatter-adds
// the column buffer 'data_col' back into the image 'data_im', honoring
// padding, stride and dilation. 'ofsbuf' supplies three precomputed values
// per column-buffer row: {w_offset, h_offset, c_im}, i.e. the kernel-tap
// coordinates and source channel that row corresponds to.
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
                int channels, int height, int width,
                int kernel_h, int kernel_w,
                int pad_h, int pad_w,
                int stride_h, int stride_w,
                int dilation_h, int dilation_w,
                Dtype* data_im,
                const int* ofsbuf)
{
    // Spatial grid of the column buffer (output size of the convolution).
    const int out_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int out_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
    const int ncols = channels * kernel_h * kernel_w;

    // The image accumulates contributions, so it must start from zero.
    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));

    for (int col = 0; col < ncols; ++col, ofsbuf += 3)
    {
        const int w_offset = ofsbuf[0];
        const int h_offset = ofsbuf[1];
        const int c_im     = ofsbuf[2];

        Dtype* im_plane = data_im + c_im * height * width;
        const Dtype* col_row = data_col + col * out_h * out_w;

        for (int oy = 0; oy < out_h; ++oy)
        {
            const int y = oy * stride_h - pad_h + h_offset * dilation_h;
            if (y < 0 || y >= height)
                continue; // this kernel tap falls entirely into the padding row

            for (int ox = 0; ox < out_w; ++ox)
            {
                const int x = ox * stride_w - pad_w + w_offset * dilation_w;
                if (x >= 0 && x < width)
                    im_plane[y * width + x] += col_row[oy * out_w + ox];
            }
        }
    }
}
// col2im entry point: scatter-adds a column buffer ('data_col', as produced
// by im2col) back into the image 'data_im'. Dispatches to the parallel
// implementation above.
// NOTE: the dilation parameters and 'ofsbuf' are accepted for interface
// compatibility but are ignored by the parallel path, which assumes
// dilation == 1.
void col2im(const float* data_col, int channels, int height, int width,
            int kernel_h, int kernel_w, int pad_h, int pad_w,
            int stride_h, int stride_w, int dilation_h, int dilation_w,
            float* data_im, const int* ofsbuf)
{
    (void)dilation_h;
    (void)dilation_w;
    (void)ofsbuf;
    col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h,
                                kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
#if 0
    // Single-threaded alternative that does honor dilation and ofsbuf;
    // kept disabled for reference/debugging.
    col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
               stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
#endif
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
#define __OPENCV_DNN_LAYERS_IM2COL_HPP__
#include <opencv2/core.hpp>
#include <cstdlib>
namespace cv
{
namespace dnn
{
// Unfolds convolution patches of 'data_im' into 'data_col' (presumably the
// row-major counterpart of im2col — see the implementation in op_im2col.cpp).
void im2row(const float* data_im, int channels, int height, int width,
            int kernel_h, int kernel_w, int pad_h, int pad_w,
            int stride_h, int stride_w, int dilation_h, int dilation_w,
            int height_col, int width_col, float* data_col);
// Inverse operation: scatter-adds the column buffer back into 'data_im'.
// NOTE: the current implementation ignores dilation_h/dilation_w and ofsbuf.
void col2im(const float* data_col, int channels, int height, int width,
            int kernel_h, int kernel_w, int pad_h, int pad_w,
            int stride_h, int stride_w, int dilation_h, int dilation_w,
            float* data_im, const int* ofsbuf);
}
}
#endif
......@@ -41,6 +41,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
#include <float.h>
#include <algorithm>
......@@ -130,50 +131,150 @@ public:
return Ptr<BackendNode>();
}
void maxPooling(Mat &src, Mat &dst, Mat &mask)
class MaxPoolingInvoker : public ParallelLoopBody
{
Size inp(src.size[3], src.size[2]),
out(dst.size[3], dst.size[2]);
public:
const Mat* src_;
Mat *dst_, *mask_;
Size kernel_, stride_, pad_;
int nstripes_;
for (int n = 0; n < src.size[0]; ++n)
MaxPoolingInvoker(const Mat& src, Mat& dst, Mat& mask, Size kernel, Size stride, Size pad, int nstripes)
{
for (int c = 0; c < src.size[1]; ++c)
{
const float *srcData = src.ptr<float>(n, c);
float *dstData = dst.ptr<float>(n, c);
float *dstMaskData = mask.ptr<float>(n, c);
src_ = &src;
dst_ = &dst;
mask_ = &mask;
kernel_ = kernel;
stride_ = stride;
pad_ = pad;
nstripes_ = nstripes;
CV_Assert(src.isContinuous() && dst.isContinuous() &&
src.type() == CV_32F && src.type() == dst.type() &&
mask.type() == src.type() && src.dims == 4 && dst.dims == 4 &&
src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
mask.size == dst.size);
}
for (int ph = 0; ph < out.height; ++ph)
void operator()(const Range& r) const
{
int nimgs = dst_->size[0], channels = dst_->size[1];
int width = dst_->size[3], height = dst_->size[2];
int inp_width = src_->size[3], inp_height = src_->size[2];
size_t total = dst_->total();
size_t stripeSize = (total + nstripes_ - 1)/nstripes_;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, total);
size_t ofs = stripeStart;
int x0 = (int)(ofs % width);
ofs /= width;
int y0 = (int)(ofs % height);
ofs /= height;
int c = (int)(ofs % channels);
int n = (int)(ofs / channels);
const float *srcData = src_->ptr<float>(n, c);
float *dstData = dst_->ptr<float>(n, c, y0) + x0;
float *dstMaskData = mask_->ptr<float>(n, c, y0) + x0;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int stride_w = stride_.width, stride_h = stride_.height;
#if CV_SIMD128
v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
v_float32x4 ones = v_setall_f32(1.f);
v_float32x4 delta = v_setall_f32((float)(inp_width - kernel_w));
#endif
for( ofs = stripeStart; ofs < stripeEnd; ofs++, dstData++, dstMaskData++ )
{
int ystart = y0 * stride_h - pad_h;
int xstart = x0 * stride_w - pad_w;
int yend = min(ystart + kernel_h, inp_height);
int xend = min(xstart + kernel_w, inp_width);
ystart = max(ystart, 0);
xstart = max(xstart, 0);
float max_val = -FLT_MAX;
int max_index = -1;
#if CV_SIMD128
if( xstart > 0 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
{
for (int pw = 0; pw < out.width; ++pw)
v_float32x4 max_val0 = v_setall_f32(max_val);
v_float32x4 max_val1 = max_val0;
v_float32x4 max_idx0 = v_setall_f32(-1.f);
v_float32x4 max_idx1 = max_idx0;
int index0 = ystart * inp_width + xstart;
v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
for (int y = ystart; y < yend; ++y)
{
int hstart = ph * stride.height - pad.height;
int wstart = pw * stride.width - pad.width;
int hend = min(hstart + kernel.height, inp.height);
int wend = min(wstart + kernel.width, inp.width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int poolIndex = ph * out.width + pw;
float max_val = -FLT_MAX;
int max_index = -1;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
{
const int index = y * inp_width + x;
v_float32x4 v0(srcData[index], srcData[index + stride_w],
srcData[index + stride_w*2], srcData[index + stride_w*3]);
v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
srcData[index + stride_w*6], srcData[index + stride_w*7]);
max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
max_val0 = v_max(max_val0, v0);
max_val1 = v_max(max_val1, v1);
}
idx0 += delta;
idx1 += delta;
}
v_store(dstData, max_val0);
v_store(dstData + 4, max_val1);
v_store(dstMaskData, max_idx0);
v_store(dstMaskData + 4, max_idx1);
ofs += 7;
dstData += 7;
dstMaskData += 7;
x0 += 7;
}
else
#endif
{
for (int y = ystart; y < yend; ++y)
for (int x = xstart; x < xend; ++x)
{
const int index = y * inp_width + x;
float val = srcData[index];
if (val > max_val)
{
const int index = h * inp.width + w;
if (srcData[index] > max_val)
{
max_val = srcData[index];
max_index = index;
}
max_val = val;
max_index = index;
}
}
dstData[poolIndex] = max_val;
dstMaskData[poolIndex] = max_index;
*dstData = max_val;
*dstMaskData = max_index;
}
if( ++x0 >= width )
{
x0 = 0;
if( ++y0 >= height )
{
y0 = 0;
if( ++c >= channels )
{
c = 0;
if( ++n >= nimgs )
break;
}
srcData = src_->ptr<float>(n, c);
}
}
}
}
};
// Parallel max-pooling entry point: splits the output elements into
// getNumThreads() stripes and runs MaxPoolingInvoker over them.
// 'mask' receives, for every output element, the flattened input index of
// the chosen maximum (written alongside the max value by the invoker).
void maxPooling(Mat &src, Mat &dst, Mat &mask)
{
    const int nstripes = getNumThreads();
    MaxPoolingInvoker mp(src, dst, mask, kernel, stride, pad, nstripes);
    parallel_for_(Range(0, nstripes), mp, nstripes);
}
void avePooling(Mat &src, Mat &dst)
......
......@@ -40,7 +40,6 @@
//M*/
#include "../precomp.hpp"
#include "op_blas.hpp"
#include <iostream>
#include <iterator>
#include <cmath>
......@@ -243,9 +242,9 @@ public:
Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
dnn::gemm(xCurr, Wx, 1, gates, 0, GEMM_2_T); // Wx * x_t
dnn::gemm(hInternal, Wh, 1, gates, 1, GEMM_2_T); //+Wh * h_{t-1}
dnn::gemm(dummyOnes, bias, 1, gates, 1); //+b
gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
Mat getesIFO = gates.colRange(0, 3*numOut);
Mat gateI = gates.colRange(0*numOut, 1*numOut);
......@@ -419,14 +418,14 @@ public:
Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
dnn::gemm(hPrev, Whh, 1, hCurr, 0, GEMM_2_T); // W_{hh} * h_{prev}
dnn::gemm(xCurr, Wxh, 1, hCurr, 1, GEMM_2_T); //+W_{xh} * x_{curr}
dnn::gemm(dummyBiasOnes, bh, 1, hCurr, 1); //+bh
gemm(hPrev, Whh, 1, hCurr, 0, hCurr, GEMM_2_T); // W_{hh} * h_{prev}
gemm(xCurr, Wxh, 1, hCurr, 1, hCurr, GEMM_2_T); //+W_{xh} * x_{curr}
gemm(dummyBiasOnes, bh, 1, hCurr, 1, hCurr); //+bh
tanh(hCurr, hPrev);
Mat oCurr = oTs.rowRange(curRowRange);
dnn::gemm(hPrev, Who, 1, oCurr, 0, GEMM_2_T); // W_{ho} * h_{prev}
dnn::gemm(dummyBiasOnes, bo, 1, oCurr, 1); //+b_o
gemm(hPrev, Who, 1, oCurr, 0, oCurr, GEMM_2_T); // W_{ho} * h_{prev}
gemm(dummyBiasOnes, bo, 1, oCurr, 1, oCurr); //+b_o
tanh(oCurr, oCurr);
if (produceH)
......
......@@ -10,7 +10,6 @@ Implementation of shift layer, which adds up const values to blob.
*/
#include "../precomp.hpp"
#include "op_blas.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
......@@ -25,15 +24,6 @@ public:
{
setParamsFrom(params);
CV_Assert(blobs.size() == 1);
#ifdef HAVE_LAPACK
{
if (getBlasThreads() != cv::getThreadNum())
{
setBlasThreads(cv::getThreadNum());
}
}
#endif
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
......@@ -76,7 +66,7 @@ public:
{
Mat dstMat(inpBlob.size[1], inpBlob.size[2] * inpBlob.size[3],
outBlob.type(), outBlob.ptr(n));
dnn::gemm(blobs[0], biasOnesMat, 1, dstMat, 1); //TODO: gemv
gemm(blobs[0], biasOnesMat, 1, dstMat, 1, dstMat); //TODO: gemv
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment