Commit b593cae0 authored by Vadim Pisarevsky, committed by GitHub

some further optimizations and cleanups in dnn (#1237)

* some further optimizations and cleanups in dnn:
+ got rid of dnn::gemm; it's probably not performance-critical anymore
+ embedded col2im functionality into convolution_layer.cpp, since it's not used anywhere else
+ parallel max pooling. Even better performance could be achieved if we knew that the max indices are not needed (and they are not needed in most networks)
+ somewhat optimized the deconvolution layer: merged the bias addition into col2im and slightly optimized col2im itself
+ hopefully fixed incorrect memory access in the fully-connected layer; restored aligned memory reads (they should work fine now)

* hopefully fixed regressions in ENet performance

* fixed some typos in deconvolution; added SIMD optimization for the max pooling layer

* fixed warnings in SIMD-less build configuration
parent 0b4fc061
@@ -41,9 +41,8 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_im2col.hpp"
#include "op_blas.hpp"
#include "op_halide.hpp"
#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <iostream>
@@ -55,16 +54,7 @@ namespace dnn
class BaseConvolutionLayerImpl : public ConvolutionLayer
{
public:
BaseConvolutionLayerImpl()
{
#ifdef HAVE_LAPACK
int nthreads = cv::getThreadNum();
if (getBlasThreads() != nthreads)
{
setBlasThreads(nthreads);
}
#endif
}
BaseConvolutionLayerImpl() {}
virtual bool supportBackend(int backendId)
{
@@ -146,7 +136,7 @@ public:
class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
public:
enum { VEC_ALIGN = 8 };
enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F };
Mat weightsMat;
Ptr<ActivationLayer> activ;
@@ -195,7 +185,11 @@ public:
return false;
}
#if 0
bool setActivation(const Ptr<ActivationLayer>& layer) { activ = layer; return true; }
#else
bool setActivation(const Ptr<ActivationLayer>&) { return false; }
#endif
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
@@ -379,9 +373,8 @@ public:
const float* biasvec = &biasvec_[0];
float* data_out0_ = output_->ptr<float>();
size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
const int valignBytes = (int)(valign*sizeof(float));
AutoBuffer<float> rowbuf0_(rowbufsz + valignBytes);
float* rowbuf0 = alignPtr((float*)rowbuf0_, valignBytes);
AutoBuffer<float> rowbuf0_(rowbufsz + valign);
float* rowbuf0 = alignPtr((float*)rowbuf0_, (int)(valign*sizeof(float)));
// we clear the buffer once; ultimately, it lets us avoid
// tail processing after running the unrolled/vectorized loop.
@@ -588,8 +581,349 @@ public:
}
};
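// Computes the 2D spectrum of every (output, input) kernel plane in parallel:
// each kernel is zero-padded to the chosen DFT size and transformed once, and
// the spectra are stored in a nouts x ninps x dftsz.area() array for reuse by
// the (experimental) DFT-based convolution path below.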
class ParallelDFTWeights : ParallelLoopBody
{
public:
const Mat* weights_;
Mat* wspectrums_;
int nstripes_;
Size kernel_, dftsz_;
int nouts_, ninps_;
static void run(const Mat& weights, Mat& wspectrums, Size kernel, Size dftsz, int nstripes)
{
CV_Assert(weights.type() == DFT_TYPE);
ParallelDFTWeights p;
p.weights_ = &weights;
p.wspectrums_ = &wspectrums;
p.nstripes_ = nstripes;
p.kernel_ = kernel;
p.dftsz_ = dftsz;
p.nouts_ = weights.rows;
p.ninps_ = weights.cols / (kernel.area());
int dft_total = dftsz.area();
int sz[] = { p.nouts_, p.ninps_, dft_total };
wspectrums.create(3, sz, DFT_TYPE);
parallel_for_(Range(0, nstripes), p, nstripes);
}
ParallelDFTWeights() {}
void operator()(const Range& r) const
{
int ninps = ninps_, nouts = nouts_;
int totalDFTs = nouts*ninps;
int stripeSize = (totalDFTs + nstripes_-1)/nstripes_;
int stripeStart = r.start*stripeSize;
int stripeEnd = std::min(r.end*stripeSize, totalDFTs);
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int dft_w = dftsz_.width, dft_h = dftsz_.height;
float* wptr = (float*)weights_->ptr<float>();
size_t wstep = weights_->step1();
Ptr<hal::DFT2D> dft2d_fwd = hal::DFT2D::create(dft_w, dft_h, DFT_TYPE, 1, 1, 0, kernel_h);
for( int i = stripeStart; i < stripeEnd; i++ )
{
int out = i / ninps;
int inp = i % ninps;
float* srcptr = wptr + out*wstep + inp*kernel_w*kernel_h;
Mat src(kernel_h, kernel_w, DFT_TYPE, srcptr);
float* dstptr = wspectrums_->ptr<float>(out, inp);
Mat dst(dft_h, dft_w, DFT_TYPE, dstptr);
size_t dstep = dft_w*sizeof(dstptr[0]);
memset(dstptr, 0, dstep*dft_h);
for( int j = 0; j < kernel_h; j++ )
memcpy(dstptr + dft_w*j, srcptr + kernel_w*j, kernel_w*sizeof(dstptr[0]));
dft2d_fwd->apply((uchar*)dstptr, dstep, (uchar*)dstptr, dstep);
}
}
};
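// Experimental FFT-based convolution path, currently disabled: the input is
// processed in tiles, each tile's spectrum is multiplied by the cached weight
// spectra, and the accumulated sum is transformed back, with the bias added on
// the way out.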
/*class ParallelDFTConv : public ParallelLoopBody
{
public:
enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 };
const Mat* input_;
const Mat* weights_;
Mat* output_;
Mat wspectrums_;
int outShape[4];
Size kernel_, pad_, blksz_, dftsz_;
int ngroups_, nstripes_;
std::vector<float> biasvec_;
const ActivationLayer* activ_;
static void run( const Mat& input, Mat& output,
const Mat& weights, const Mat& bias,
Size kernel, Size pad, int ngroups, int nstripes,
const ActivationLayer* activ )
{
CV_Assert( input.dims == 4 && output.dims == 4 &&
input.size[0] == output.size[0] &&
weights.rows == output.size[1] &&
weights.cols == (input.size[1]/ngroups)*kernel.width*kernel.height &&
input.type() == output.type() &&
input.type() == weights.type() &&
input.type() == CV_32F &&
input.isContinuous() &&
output.isContinuous() &&
(bias.empty() || (bias.isContinuous() && bias.type() == CV_32F &&
bias.total() == (size_t)output.size[1])));
ParallelDFTConv p;
p.input_ = &input;
p.weights_ = &weights;
p.output_ = &output;
for( int i = 0; i < 4; i++ ) p.outShape[i] = output.size[i];
p.outShape[1] /= ngroups;
p.kernel_ = kernel; p.pad_ = pad;
p.ngroups_ = ngroups;
p.nstripes_ = nstripes;
p.activ_ = activ;
const double blockScale = 4.5;
const int minBlockSize = 32;
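// pick a tile roughly blockScale x the kernel size, but no smaller than
// minBlockSize taps and no larger than the output itself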
Size resultsz(output.size[3], output.size[2]);
Size blksz, dftsz;
blksz.width = cvRound(kernel.width*blockScale);
blksz.width = std::max(blksz.width, minBlockSize - kernel.width + 1);
blksz.width = std::min(blksz.width, resultsz.width);
blksz.height = cvRound(kernel.height*blockScale);
blksz.height = std::max(blksz.height, minBlockSize - kernel.height + 1);
blksz.height = std::min(blksz.height, resultsz.height);
// compute DFT size along each dimension; make sure it's even, because we want
// real DFT & inverse DFT to be fast.
dftsz.width = blksz.width + kernel.width - 1;
for(;;)
{
dftsz.width = getOptimalDFTSize(dftsz.width);
if( dftsz.width <= 0 )
CV_Error( CV_StsOutOfRange, "cannot compute the right DFT size" );
if(dftsz.width % 2 == 0)
break;
dftsz.width++;
}
dftsz.height = blksz.height + kernel.height - 1;
for(;;)
{
dftsz.height = getOptimalDFTSize(dftsz.height);
if( dftsz.height <= 0 )
CV_Error( CV_StsOutOfRange, "cannot compute the right DFT size" );
if(dftsz.height % 2 == 0)
break;
dftsz.height++;
}
// transform all the weights for the layer; we do it on each run because
// if we compute and store spectrums of all the weights for all the convolution
// layers, it may take a lot of memory
ParallelDFTWeights::run(weights, p.wspectrums_, kernel, dftsz, nstripes);
// recompute block size
blksz.width = dftsz.width - kernel.width + 1;
blksz.width = std::min(blksz.width, resultsz.width);
blksz.height = dftsz.height - kernel.height + 1;
blksz.height = std::min(blksz.height, resultsz.height);
printf("DFT conv: blk=(%d x %d), DFT=(%d x %d)\n", blksz.width, blksz.height, dftsz.width, dftsz.height);
p.dftsz_ = dftsz;
p.blksz_ = blksz;
int k, outCn = output.size[1];
p.biasvec_.resize(outCn+2);
float* biasvec = &p.biasvec_[0];
if( bias.empty() )
{
for( k = 0; k < outCn; k++ )
biasvec[k] = 0.f;
}
else
{
for( k = 0; k < outCn; k++ )
biasvec[k] = bias.at<float>(k);
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
parallel_for_(Range(0, nstripes), p, nstripes);
}
ParallelDFTConv() {}
void operator()(const Range& r0) const
{
int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
int out_w = output_->size[3], out_h = output_->size[2], outCn = output_->size[1]/ngroups;
int width = input_->size[3], height = input_->size[2], inpCn = input_->size[1]/ngroups;
int nstripes = nstripes_;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int blk_w = blksz_.width, blk_h = blksz_.height;
int dft_w = dftsz_.width, dft_h = dftsz_.height;
int dft_elems = dft_w*dft_h;
size_t dftstep = dft_w*sizeof(float);
int i, j;
size_t inpPlaneSize = width*height;
size_t outPlaneSize = out_w*out_h;
int ndfts_w = (out_w + blk_w - 1)/blk_w;
int ndfts_h = (out_h + blk_h - 1)/blk_h;
int ndfts_plane = ndfts_w*ndfts_h;
int stripesPerSample;
int ndfts_stripe;
Range r = r0;
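// if there are at least twice as many stripes as samples, split each
// sample's DFT tiles across several stripes; otherwise give each stripe
// one or more whole samples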
if( nstripes >= batchSize*2 )
{
stripesPerSample = nstripes/batchSize;
ndfts_stripe = (ndfts_plane + stripesPerSample - 1)/stripesPerSample;
}
else
{
stripesPerSample = 1;
int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1);
r.start *= samplesPerStripe;
r.end *= samplesPerStripe;
nstripes *= samplesPerStripe;
ndfts_stripe = ndfts_plane;
}
Mat spectrums((inpCn+1)*dft_h, dft_w, DFT_TYPE);
Mat out_spectrum = spectrums.rowRange(dft_h*inpCn, dft_h*(inpCn+1));
const float* wptr0 = wspectrums_.ptr<float>();
const float* data_inp0_ = input_->ptr<float>();
const float* biasvec = &biasvec_[0];
float* data_out0_ = output_->ptr<float>();
float dft_scale = 1.f/(dft_w*dft_h);
Ptr<hal::DFT2D> dft2d_fwd = hal::DFT2D::create(dft_w, dft_h, DFT_TYPE, 1, 1,
CV_HAL_DFT_IS_INPLACE, blk_h + kernel_h - 1);
Ptr<hal::DFT2D> dft2d_inv = hal::DFT2D::create(dft_w, dft_h, DFT_TYPE, 1, 1,
CV_HAL_DFT_INVERSE|CV_HAL_DFT_SCALE, blk_h);
for( int stripe = r.start; stripe < r.end; stripe++ )
{
int subsampleIdx = stripe/stripesPerSample;
if( subsampleIdx >= batchSize )
break;
int startOutCn = (subsampleIdx % ngroups)*outCn;
const float* biasptr = biasvec + startOutCn;
int dft_idx0 = (stripe - subsampleIdx*stripesPerSample)*ndfts_stripe;
int dft_idx1 = std::min(dft_idx0 + ndfts_stripe, ndfts_plane);
for( int dft_idx = dft_idx0; dft_idx < dft_idx1; dft_idx++ )
{
int dft_y = dft_idx / ndfts_w;
int dft_x = dft_idx - dft_y*ndfts_w;
dft_x *= blk_w;
dft_y *= blk_h;
int bw = std::min(blk_w, out_w - dft_x);
int bh = std::min(blk_h, out_h - dft_y);
int patch_w = bw + kernel_w - 1;
int patch_h = bh + kernel_h - 1;
int in_x = dft_x - pad_w;
int in_y = dft_y - pad_h;
int i0 = std::max(0, -in_y);
int i1 = std::min(patch_h, height - in_y);
int j0 = std::max(0, -in_x);
int j1 = std::min(patch_w, width - in_x);
const float* data_inp = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn + in_y*width + in_x;
float* sdata0 = spectrums.ptr<float>();
float* data_out = data_out0_ + subsampleIdx*outPlaneSize*outCn + dft_y*out_w + dft_x;
// phase 1. extract tiles from the input tensor channels and
// compute their spectrums.
float* sdata = sdata0;
for( int cn = 0; cn < inpCn; cn++, data_inp += inpPlaneSize )
{
for( i = 0; i < dft_h; i++, sdata += dft_w )
{
if( i < i0 || i >= i1 )
memset(sdata, 0, dft_w*sizeof(sdata[0]));
else
{
for( j = 0; j < j0; j++ )
sdata[j] = 0.f;
for( ; j < j1; j++ )
sdata[j] = data_inp[i*width + j];
for( ; j < dft_w; j++ )
sdata[j] = 0.f;
}
}
uchar* dftdata = (uchar*)(sdata - dft_elems);
dft2d_fwd->apply(dftdata, dftstep, dftdata, dftstep);
}
// phase 2. iterate over output channels. For each output channel multiply
// all the input channels by the corresponding weights and sum the results.
// all this is done in the Fourier domain.
// When the sum is computed, apply the inverse DFT, then add bias and save
// the results.
for( int ocn = 0; ocn < outCn; ocn++, data_out += outPlaneSize )
{
float* odata = out_spectrum.ptr<float>();
memset(odata, 0, dft_elems*sizeof(odata[0]));
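// multiply-accumulate in the CCS-packed real-DFT layout: the four corner
// bins are purely real, the first/last columns pack (re, im) vertically in
// odd/even row pairs, and all other bins pack (re, im) in horizontal pairs.
// Multiplying by the conjugate of the weight spectrum gives correlation,
// which is what CNN "convolution" actually computes.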
for( int cn = 0; cn < inpCn; cn++ )
{
const float* wptr = wptr0 + ((ocn + startOutCn)*inpCn + cn)*dft_elems;
const float* sdata = sdata0 + cn*dft_elems;
odata[0] += sdata[0]*wptr[0];
odata[dft_w-1] += sdata[dft_w-1]*wptr[dft_w-1];
odata[dft_elems-dft_w] += sdata[dft_elems-dft_w]*wptr[dft_elems-dft_w];
odata[dft_elems-1] += sdata[dft_elems-1]*wptr[dft_elems-1];
for( i = 1; i < dft_h-1; i += 2 )
{
int re = i*dft_w, im = re + dft_w;
odata[re] += sdata[re]*wptr[re] + sdata[im]*wptr[im];
odata[im] += sdata[im]*wptr[re] - sdata[re]*wptr[im];
re += dft_w-1; im += dft_w-1;
odata[re] += sdata[re]*wptr[re] + sdata[im]*wptr[im];
odata[im] += sdata[im]*wptr[re] - sdata[re]*wptr[im];
}
for( i = 0; i < dft_h; i++ )
{
for( j = 1; j < dft_w-1; j += 2 )
{
int idx = i*dft_w + j;
float re = sdata[idx], im = sdata[idx+1];
float wre = wptr[idx], wim = wptr[idx+1];
float ore = odata[idx], oim = odata[idx+1];
odata[idx] = ore + re*wre + im*wim;
odata[idx+1] = oim + im*wre - re*wim;
}
}
}
dft2d_inv->apply((const uchar*)odata, dftstep, (uchar*)odata, dftstep);
float bias = biasptr[ocn];
for( i = 0; i < bh; i++ )
{
for( j = 0; j < bw; j++ )
{
data_out[i*out_w + j] = odata[i*dft_w + j] + bias;
}
}
}
}
}
}
};*/
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
/*printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
name.c_str(), inputs[0]->size[0], inputs[0]->size[1], inputs[0]->size[2], inputs[0]->size[3],
kernel.width, kernel.height, pad.width, pad.height,
stride.width, stride.height, dilation.width, dilation.height);*/
CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
int ngroups = inputs[0]->size[1]/blobs[0].size[1];
CV_Assert(outputs[0].size[1] % ngroups == 0);
@@ -614,8 +948,18 @@ public:
Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
int nstripes = std::max(getNumThreads(), 1);
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasesMat,
kernel, pad, stride, dilation, ngroups, nstripes, activ.get());
/*if( stride == Size(1, 1) && dilation == Size(1, 1) && kernel.width >= 3 && kernel.height >= 3 )
{
ParallelDFTConv::run(*inputs[0], outputs[0], weightsMat, biasesMat,
kernel, pad, ngroups, nstripes, activ.get());
}
else*/
{
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasesMat,
kernel, pad, stride, dilation, ngroups, nstripes, activ.get());
}
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
@@ -636,6 +980,8 @@ public:
class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
public:
Mat weightsMat, biasesMat;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
int inpCn = inpShape[1];
@@ -682,70 +1028,283 @@ public:
return false;
}
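// Parallel dense matrix product C = A*B, replacing the old dnn::gemm call in
// the deconvolution layer. Work is split into stripes over the columns of B;
// each stripe processes two rows of A at a time and four rows of B per inner
// step, with a universal-intrinsics kernel when CV_SIMD128 is available.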
class MatMulInvoker : public ParallelLoopBody
{
public:
MatMulInvoker(const Mat& a, const Mat& b, Mat& c, int nstripes)
{
a_ = &a;
b_ = &b;
c_ = &c;
nstripes_ = nstripes;
useAVX2 = checkHardwareSupport(CPU_AVX2);
}
void operator()(const Range& range_) const
{
int stripeSize = (int)alignSize((b_->cols + nstripes_ - 1)/nstripes_, 16);
Range range(range_.start*stripeSize, std::min(range_.end*stripeSize, b_->cols));
int mmax = a_->rows;
int nmax = range.end - range.start;
int kmax = a_->cols;
int m, n, k;
const float* aptr = a_->ptr<float>();
const float* bptr = b_->ptr<float>() + range.start;
float* cptr = c_->ptr<float>() + range.start;
size_t astep = a_->step1();
size_t bstep = b_->step1();
size_t cstep = c_->step1();
#if CV_DNN_TRY_AVX2
if( useAVX2 )
fastGEMM_avx2( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
else
#endif
for( m = 0; m < mmax; m += 2 )
{
float* dst0 = cptr + cstep*m;
float* dst1 = cptr + cstep*std::min(m+1, mmax-1);
const float* aptr0 = aptr + astep*m;
const float* aptr1 = aptr + astep*std::min(m+1, mmax-1);
for( n = 0; n < nmax; n++ )
{
dst0[n] = 0.f;
dst1[n] = 0.f;
}
for( k = 0; k < kmax; k += 4 )
{
float alpha00 = aptr0[k];
float alpha01 = aptr1[k];
float alpha10 = 0.f, alpha11 = 0.f;
float alpha20 = 0.f, alpha21 = 0.f;
float alpha30 = 0.f, alpha31 = 0.f;
const float* bptr0 = bptr + k*bstep;
const float* bptr1 = bptr0;
const float* bptr2 = bptr0;
const float* bptr3 = bptr0;
if( k+1 < kmax )
{
alpha10 = aptr0[k+1];
alpha11 = aptr1[k+1];
bptr1 = bptr0 + bstep;
if( k+2 < kmax )
{
alpha20 = aptr0[k+2];
alpha21 = aptr1[k+2];
bptr2 = bptr1 + bstep;
if( k+3 < kmax )
{
alpha30 = aptr0[k+3];
alpha31 = aptr1[k+3];
bptr3 = bptr2 + bstep;
}
}
}
n = 0;
#if CV_SIMD128
v_float32x4 a00 = v_setall_f32(alpha00);
v_float32x4 a01 = v_setall_f32(alpha01);
v_float32x4 a10 = v_setall_f32(alpha10);
v_float32x4 a11 = v_setall_f32(alpha11);
v_float32x4 a20 = v_setall_f32(alpha20);
v_float32x4 a21 = v_setall_f32(alpha21);
v_float32x4 a30 = v_setall_f32(alpha30);
v_float32x4 a31 = v_setall_f32(alpha31);
for( ; n <= nmax - 4; n += 4 )
{
v_float32x4 b0 = v_load(bptr0 + n);
v_float32x4 b1 = v_load(bptr1 + n);
v_float32x4 b2 = v_load(bptr2 + n);
v_float32x4 b3 = v_load(bptr3 + n);
v_float32x4 d0 = v_load(dst0 + n);
v_float32x4 d1 = v_load(dst1 + n);
d0 += b0*a00;
d1 += b0*a01;
d0 += b1*a10;
d1 += b1*a11;
d0 += b2*a20;
d1 += b2*a21;
d0 += b3*a30;
d1 += b3*a31;
v_store(dst0 + n, d0);
v_store(dst1 + n, d1);
}
#endif
for( ; n < nmax; n++ )
{
float b0 = bptr0[n], b1 = bptr1[n];
float b2 = bptr2[n], b3 = bptr3[n];
float d0 = dst0[n] + alpha00*b0 + alpha10*b1 + alpha20*b2 + alpha30*b3;
float d1 = dst1[n] + alpha01*b0 + alpha11*b1 + alpha21*b2 + alpha31*b3;
dst0[n] = d0;
dst1[n] = d1;
}
}
}
}
const Mat *a_, *b_;
Mat* c_;
int nstripes_;
bool useAVX2;
};
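// Parallel col2im with the bias addition fused in: each output element is
// accumulated from the matching columns of the col buffer, and the per-channel
// bias is added in the same pass, so no separate bias GEMM is needed anymore.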
class Col2ImInvoker : public cv::ParallelLoopBody
{
public:
const float* data_col;
const float* biasvec;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
float* data_im;
int height_col, width_col;
int nstripes;
bool is1x1;
Col2ImInvoker() {}
static void run(const float* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
float* data_im,
const float* biasvec,
bool is1x1)
{
const int nstripes = getNumThreads();
Col2ImInvoker t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
t.nstripes = nstripes;
t.is1x1 = is1x1;
t.biasvec = biasvec;
parallel_for_(Range(0, nstripes), t, nstripes);
}
virtual void operator ()(const Range &r) const
{
const float* data_col_ = data_col;
float* data_im_ = data_im;
int coeff_h = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w = (1 - stride_w * height_col * width_col);
size_t total = (size_t)channels * height * width;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t startIndex = r.start*stripeSize;
size_t endIndex = std::min(r.end*stripeSize, total);
int w = (int)(startIndex % width + pad_w);
int h = (int)((startIndex / width) % height + pad_h);
int c = (int)(startIndex / (width * height));
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
int plane_size_col = height_col * width_col;
int offset = (c * kernel_h * kernel_w + h * kernel_w + w) * plane_size_col;
bool is1x1_ = is1x1;
const float* biasvec_ = biasvec;
for (size_t index = startIndex; index < endIndex; index++)
{
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
float val;
if( is1x1_ )
val = data_im_[index];
else
{
val = 0.f;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h + w_col * coeff_w];
}
}
}
data_im_[index] = val + biasvec_[c];
offset += plane_size_col;
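// advance (w, h, c) incrementally; re-derive them from the linear index
// only when the coordinate wraps to the next row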
if( ++w >= width + pad_w )
{
w = (int)((index + 1)% width + pad_w);
h = (int)(((index + 1) / width) % height + pad_h);
c = (int)((index + 1) / (width * height));
h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
h_col_end = std::min(h / stride_h + 1, height_col);
offset = (c * kernel_h * kernel_w + h * kernel_w + w) * plane_size_col;
}
}
}
};
void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
internals[0].setTo(0);
if (hasBias())
internals[1].setTo(1);
int outCn = blobs[0].size[0];
int inpCn = inputs[0]->size[1];
Mat weightsMat = blobs[0].reshape(1, inpCn);
Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
bool is1x1flag = is1x1();
int nstripes = getNumThreads();
if( weightsMat.empty() )
{
transpose(blobs[0].reshape(1, inpCn), weightsMat);
biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat::zeros(outCn, 1, CV_32F);
}
for (size_t ii = 0; ii < outputs.size(); ii++)
{
int ngroups = inpCn / blobs[0].size[1];
int inpGroupCn = blobs[0].size[1];
int outGroupCn = outCn / ngroups;
int numImg = inputs[ii]->size[0];
const Mat& inp = *inputs[ii];
Mat& out = outputs[ii];
int numImg = inp.size[0];
int outH = out.size[2], outW = out.size[3];
Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);
Mat decnBlob = out.reshape(1, numImg*outCn);
for (int n = 0; n < numImg; n++)
{
for (int g = 0; g < ngroups; g++)
{
Mat dstMat = decnBlob.rowRange(_Range((g + n * ngroups) * outGroupCn, outGroupCn));
Mat &colMat = (is1x1()) ? dstMat : internals[0];
Mat &colMat = is1x1flag ? dstMat : internals[0];
Mat convMat = convBlob.rowRange(_Range((g + n * ngroups) * inpGroupCn, inpGroupCn));
Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
Mat wghtMat = weightsMat.colRange(_Range(g * inpGroupCn, inpGroupCn));
Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
if (!is1x1())
col2im(colMat, dstMat, shape(*inputs[ii]), shape(outputs[ii]));
//gemm(wghtMat, convMat, 1, colMat, 0, colMat, 0);
MatMulInvoker mminvoker(wghtMat, convMat, colMat, nstripes);
parallel_for_(Range(0, nstripes), mminvoker, nstripes);
if (hasBias())
{
Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
dnn::gemm(curBiasMat, internals[1], 1, dstMat, 1);
}
Col2ImInvoker::run(colMat.ptr<float>(), outGroupCn, outH, outW,
kernel.height, kernel.width, pad.height, pad.width,
stride.height, stride.width, dstMat.ptr<float>(),
curBiasMat.ptr<float>(), is1x1flag);
}
}
}
}
void col2im(const Mat &colMat, Mat &dstImg, const MatShape& inShape, const MatShape& outShape)
{
int outCn = outShape[1], outH = outShape[2], outW = outShape[3];
int inpCn = inShape[1];
int ngroups = inpCn / blobs[0].size[1];
int outGroupCn = outCn / ngroups;
if (is1x1())
{
dstImg = colMat;
return;
}
cv::dnn::col2im(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width,
pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, dstImg.ptr<float>(), &ofsbuf[0]);
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
@@ -808,8 +1367,6 @@ public:
return flops;
}
std::vector<int> ofsbuf;
};
//Convolution and Deconvolution
@@ -41,7 +41,6 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_blas.hpp"
#include "op_halide.hpp"
#include <opencv2/dnn/shape_utils.hpp>
@@ -133,33 +132,42 @@ public:
void operator()(const Range& r) const
{
int valign = FullyConnectedLayerImpl::VEC_ALIGN;
int nsamples = srcMat_->rows;
int nw0 = weights_->rows;
int vecsize = srcMat_->cols;
int k, vecsize = srcMat_->cols;
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
int nstripes = nstripes_;
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights_->step1();
AutoBuffer<float> srcbuf(vecsize_aligned + valign);
float* sptr = alignPtr((float*)srcbuf, (int)(valign*sizeof(float)));
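// zero the padded tail once so the vectorized loops can run over the whole
// aligned length without separate tail handling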
for( k = vecsize; k < vecsize_aligned; k++ )
sptr[k] = 0.f;
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr = srcMat_->ptr<float>(sampleIdx);
const float* sptr_ = srcMat_->ptr<float>(sampleIdx);
const float* wptr = weights_->ptr<float>(delta);
float* dptr = dstMat_->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat_->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
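// copy the sample row into the aligned scratch buffer; this is what makes
// the aligned loads (v_load_aligned / _mm256_load_ps) below safe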
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
#if CV_DNN_TRY_AVX2
if( useAVX2_ )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0, k;
int i = 0;
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
@@ -169,7 +177,7 @@ public:
for( k = 0; k < vecsize; k += 4 )
{
v_float32x4 v = v_load(sptr + k);
v_float32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
@@ -204,7 +204,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_loadu_ps(vec + k);
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
@@ -237,7 +237,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_loadu_ps(vec + k);
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
}
@@ -250,6 +250,76 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
_mm256_zeroupper();
}
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb )
{
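// register-blocked AVX2 GEMM: each iteration computes a 4x16 block of C
// with eight FMA accumulators; the scalar loop at the end handles the
// remaining (<16) columns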
int n = 0;
for( ; n <= nb - 16; n += 16 )
{
for( int m = 0; m < ma; m += 4 )
{
const float* aptr0 = aptr + astep*m;
const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
const float* aptr3 = aptr + astep*std::min(m+3, ma-1);
float* cptr0 = cptr + cstep*m;
float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
float* cptr3 = cptr + cstep*std::min(m+3, ma-1);
__m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
__m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
__m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
__m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();
for( int k = 0; k < na; k++ )
{
__m256 a0 = _mm256_set1_ps(aptr0[k]);
__m256 a1 = _mm256_set1_ps(aptr1[k]);
__m256 a2 = _mm256_set1_ps(aptr2[k]);
__m256 a3 = _mm256_set1_ps(aptr3[k]);
__m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n);
__m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8);
d00 = _mm256_fmadd_ps(a0, b0, d00);
d01 = _mm256_fmadd_ps(a0, b1, d01);
d10 = _mm256_fmadd_ps(a1, b0, d10);
d11 = _mm256_fmadd_ps(a1, b1, d11);
d20 = _mm256_fmadd_ps(a2, b0, d20);
d21 = _mm256_fmadd_ps(a2, b1, d21);
d30 = _mm256_fmadd_ps(a3, b0, d30);
d31 = _mm256_fmadd_ps(a3, b1, d31);
}
_mm256_storeu_ps(cptr0 + n, d00);
_mm256_storeu_ps(cptr0 + n + 8, d01);
_mm256_storeu_ps(cptr1 + n, d10);
_mm256_storeu_ps(cptr1 + n + 8, d11);
_mm256_storeu_ps(cptr2 + n, d20);
_mm256_storeu_ps(cptr2 + n + 8, d21);
_mm256_storeu_ps(cptr3 + n, d30);
_mm256_storeu_ps(cptr3 + n + 8, d31);
}
}
_mm256_zeroupper();
for( ; n < nb; n++ )
{
for( int m = 0; m < ma; m++ )
{
const float* aptr0 = aptr + astep*m;
float* cptr0 = cptr + cstep*m;
float d0 = 0.f;
for( int k = 0; k < na; k++ )
d0 += aptr0[k]*bptr[k*bstep + n];
cptr0[n] = d0;
}
}
}
}
}
@@ -42,8 +42,6 @@
#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp>
#include "op_blas.hpp"
#include "op_im2col.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
@@ -74,6 +72,9 @@ void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#else
#define CV_DNN_TRY_AVX2 0
@@ -41,7 +41,6 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_blas.hpp"
#include <float.h>
#include <algorithm>
@@ -182,14 +181,14 @@ public:
Mat norm(channelSize, 1, buffer.type()); // 1 x channelSize
// (_channels x channelSize)T * _channels x 1 -> channelSize x 1
gemmCPU(buffer, sumChannelMultiplier, 1, norm, 0, GEMM_1_T);
gemm(buffer, sumChannelMultiplier, 1, norm, 0, norm, GEMM_1_T);
// compute norm
pow(norm, 0.5f, norm);
// scale the layer
// _channels x 1 * (channelSize x 1)T -> _channels x channelSize
gemmCPU(sumChannelMultiplier, norm, 1, buffer, 0, GEMM_2_T);
gemm(sumChannelMultiplier, norm, 1, buffer, 0, buffer, GEMM_2_T);
dst = src / buffer;
}
@@ -204,7 +203,7 @@ public:
{
// _scale: _channels x 1
// _channels x 1 * 1 x channelSize -> _channels x channelSize
gemmCPU(scale, sumSpatialMultiplier, 1, buffer, 0);
gemm(scale, sumSpatialMultiplier, 1, buffer, 0, buffer);
dst = dst.mul(buffer);
}
#include "op_blas.hpp"
#ifdef HAVE_LAPACK
#include "opencv_lapack.h"
#endif
#include <iostream>
namespace cv
{
namespace dnn
{
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags)
{
if (C.isMat())
gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
else
{
cv::gemm(A, B, alpha, (beta == 0) ? noArray() : C, beta, C, flags);
}
}
inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
{
CV_DbgAssert(A.dims == 2);
rows = (isTrans) ? A.cols : A.rows;
cols = (isTrans) ? A.rows : A.cols;
}
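// single-precision fallback GEMM used when no LAPACK is available:
// parallelized over column stripes of B, computing two rows of C at a time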
class GEMMInvoker : public ParallelLoopBody
{
public:
GEMMInvoker(const Mat* _a, const Mat* _b, double _alpha, Mat* _c, double _beta)
{
a = _a;
b = _b;
c = _c;
alpha = _alpha;
beta = _beta;
}
void operator()(const Range& range) const
{
int mmax = a->rows;
int nmax = range.end - range.start;
int kmax = a->cols;
int m, n, k;
AutoBuffer<float> buf(nmax);
float* ptr = buf;
if( mmax % 2 != 0 )
memset(ptr, 0, nmax*sizeof(ptr[0]));
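// when the row count is odd, 'ptr' serves as a write-only dummy row for
// the second half of the 2-row unrolled iteration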
for( m = 0; m < mmax; m += 2 )
{
float* dst0 = c->ptr<float>(m) + range.start;
float* dst1 = m+1 < mmax ? c->ptr<float>(m+1) + range.start : ptr;
const float* aptr0 = a->ptr<float>(m);
const float* aptr1 = m+1 < mmax ? a->ptr<float>(m+1) : aptr0;
if( beta != 1 )
{
if( beta == 0 )
for( n = 0; n < nmax; n++ )
{
dst0[n] = 0.f;
dst1[n] = 0.f;
}
else
for( n = 0; n < nmax; n++ )
{
dst0[n] *= (float)beta;
dst1[n] *= (float)beta;
}
}
for( k = 0; k < kmax; k++ )
{
float alpha0 = (float)(alpha*aptr0[k]);
float alpha1 = (float)(alpha*aptr1[k]);
const float* bptr = b->ptr<float>(k) + range.start;
for( n = 0; n < nmax; n++ )
{
float d0 = dst0[n] + alpha0*bptr[n];
float d1 = dst1[n] + alpha1*bptr[n];
dst0[n] = d0;
dst1[n] = d1;
}
}
}
}
const Mat *a, *b;
Mat* c;
double alpha, beta;
};
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags /*= 0*/)
{
#ifdef HAVE_LAPACK
bool transA = static_cast<bool>(flags & GEMM_1_T);
bool transB = static_cast<bool>(flags & GEMM_2_T);
bool transC = static_cast<bool>(flags & GEMM_3_T);
int Arows, Acols, Brows, Bcols, Crows, Ccols;
SwapRowCols(A, Arows, Acols, transA);
SwapRowCols(B, Brows, Bcols, transB);
SwapRowCols(C, Crows, Ccols, transC);
CV_Assert(!(flags & GEMM_3_T));
CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
CV_Assert(A.type() == B.type() && B.type() == C.type());
CV_Assert(A.data != C.data && B.data != C.data);
if (C.type() == CV_32F)
{
cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
Arows, Bcols, Acols,
(float)alpha, A.ptr<float>(), A.cols,
B.ptr<float>(), B.cols,
(float)beta, C.ptr<float>(), C.cols);
}
else if (C.type() == CV_64F)
{
//TODO: Should be tested
cblas_dgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
Arows, Bcols, Acols,
alpha, A.ptr<double>(), A.cols,
B.ptr<double>(), B.cols,
beta, C.ptr<double>(), C.cols);
}
else
{
CV_Error(Error::BadDepth, "Only floating point types are supported");
}
#else
if( C.type() == CV_32F && flags == 0 )
{
GEMMInvoker invoker(&A, &B, alpha, &C, beta);
double granularity = 10000000./((double)A.rows*A.cols);
parallel_for_(Range(0, B.cols), invoker, granularity);
}
else
cv::gemm(A, B, alpha, C, beta, C, flags);
#endif
}
int getBlasThreads()
{
#ifdef OPENBLAS_VERSION
return openblas_get_num_threads();
#else
return 1;
#endif
}
void setBlasThreads(int numThreads)
{
#ifdef OPENBLAS_VERSION
openblas_set_num_threads(numThreads);
goto_set_num_threads(numThreads);
#else
(void)numThreads; //suppress compilers' warning
#endif
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#define __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#include "../precomp.hpp"
namespace cv
{
namespace dnn
{
int getBlasThreads();
void setBlasThreads(int numThreads);
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags = 0);
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags = 0);
}
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include <opencv2/core/ocl.hpp>
#include "opencl_kernels_dnn.hpp"
#include "op_im2col.hpp"
#include "opencl_kernels_dnn.hpp"
namespace cv {
namespace dnn {
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
const Dtype* data_col_ = data_col;
Dtype* data_im_ = data_im;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
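// precomputed linear coefficients: stepping h_col or w_col by one changes
// the column-buffer index by a constant, so the inner loops below avoid
// recomputing the full offset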
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im_[index] = val;
}
}
};
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
Dtype* data_im,
const int* ofsbuf)
{
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c, ofsbuf += 3)
{
//int w_offset = c % kernel_w;
//int h_offset = (c / kernel_w) % kernel_h;
//int c_im = c / kernel_h / kernel_w;
int w_offset = ofsbuf[0];
int h_offset = ofsbuf[1];
int c_im = ofsbuf[2];
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf)
{
(void)dilation_h;
(void)dilation_w;
(void)ofsbuf;
col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
#if 0
col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
#endif
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
#define __OPENCV_DNN_LAYERS_IM2COL_HPP__
#include <opencv2/core.hpp>
#include <cstdlib>
namespace cv
{
namespace dnn
{
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col);
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf);
}
}
#endif
@@ -41,6 +41,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
#include <float.h>
#include <algorithm>
@@ -130,50 +131,150 @@ public:
return Ptr<BackendNode>();
}
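// Parallel max pooling: all output elements are flattened into one index
// space and split into stripes; with CV_SIMD128, eight horizontally adjacent
// outputs are pooled at once, tracking both the max values and the
// float-encoded argmax indices used by max-unpooling.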
void maxPooling(Mat &src, Mat &dst, Mat &mask)
class MaxPoolingInvoker : public ParallelLoopBody
{
Size inp(src.size[3], src.size[2]),
out(dst.size[3], dst.size[2]);
public:
const Mat* src_;
Mat *dst_, *mask_;
Size kernel_, stride_, pad_;
int nstripes_;
for (int n = 0; n < src.size[0]; ++n)
MaxPoolingInvoker(const Mat& src, Mat& dst, Mat& mask, Size kernel, Size stride, Size pad, int nstripes)
{
for (int c = 0; c < src.size[1]; ++c)
{
const float *srcData = src.ptr<float>(n, c);
float *dstData = dst.ptr<float>(n, c);
float *dstMaskData = mask.ptr<float>(n, c);
src_ = &src;
dst_ = &dst;
mask_ = &mask;
kernel_ = kernel;
stride_ = stride;
pad_ = pad;
nstripes_ = nstripes;
CV_Assert(src.isContinuous() && dst.isContinuous() &&
src.type() == CV_32F && src.type() == dst.type() &&
mask.type() == src.type() && src.dims == 4 && dst.dims == 4 &&
src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
mask.size == dst.size);
}
for (int ph = 0; ph < out.height; ++ph)
void operator()(const Range& r) const
{
int nimgs = dst_->size[0], channels = dst_->size[1];
int width = dst_->size[3], height = dst_->size[2];
int inp_width = src_->size[3], inp_height = src_->size[2];
size_t total = dst_->total();
size_t stripeSize = (total + nstripes_ - 1)/nstripes_;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, total);
size_t ofs = stripeStart;
int x0 = (int)(ofs % width);
ofs /= width;
int y0 = (int)(ofs % height);
ofs /= height;
int c = (int)(ofs % channels);
int n = (int)(ofs / channels);
const float *srcData = src_->ptr<float>(n, c);
float *dstData = dst_->ptr<float>(n, c, y0) + x0;
float *dstMaskData = mask_->ptr<float>(n, c, y0) + x0;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int stride_w = stride_.width, stride_h = stride_.height;
#if CV_SIMD128
v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
v_float32x4 ones = v_setall_f32(1.f);
v_float32x4 delta = v_setall_f32((float)(inp_width - kernel_w));
#endif
for( ofs = stripeStart; ofs < stripeEnd; ofs++, dstData++, dstMaskData++ )
{
int ystart = y0 * stride_h - pad_h;
int xstart = x0 * stride_w - pad_w;
int yend = min(ystart + kernel_h, inp_height);
int xend = min(xstart + kernel_w, inp_width);
ystart = max(ystart, 0);
xstart = max(xstart, 0);
float max_val = -FLT_MAX;
int max_index = -1;
#if CV_SIMD128
if( xstart > 0 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
{
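// fast path: all eight pooling windows fit entirely inside the input,
// so no per-window border clamping is needed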
for (int pw = 0; pw < out.width; ++pw)
v_float32x4 max_val0 = v_setall_f32(max_val);
v_float32x4 max_val1 = max_val0;
v_float32x4 max_idx0 = v_setall_f32(-1.f);
v_float32x4 max_idx1 = max_idx0;
int index0 = ystart * inp_width + xstart;
v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
for (int y = ystart; y < yend; ++y)
{
int hstart = ph * stride.height - pad.height;
int wstart = pw * stride.width - pad.width;
int hend = min(hstart + kernel.height, inp.height);
int wend = min(wstart + kernel.width, inp.width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int poolIndex = ph * out.width + pw;
float max_val = -FLT_MAX;
int max_index = -1;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
{
const int index = y * inp_width + x;
v_float32x4 v0(srcData[index], srcData[index + stride_w],
srcData[index + stride_w*2], srcData[index + stride_w*3]);
v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
srcData[index + stride_w*6], srcData[index + stride_w*7]);
max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
max_val0 = v_max(max_val0, v0);
max_val1 = v_max(max_val1, v1);
}
idx0 += delta;
idx1 += delta;
}
v_store(dstData, max_val0);
v_store(dstData + 4, max_val1);
v_store(dstMaskData, max_idx0);
v_store(dstMaskData + 4, max_idx1);
ofs += 7;
dstData += 7;
dstMaskData += 7;
x0 += 7;
}
else
#endif
{
for (int y = ystart; y < yend; ++y)
for (int x = xstart; x < xend; ++x)
{
const int index = y * inp_width + x;
float val = srcData[index];
if (val > max_val)
{
const int index = h * inp.width + w;
if (srcData[index] > max_val)
{
max_val = srcData[index];
max_index = index;
}
max_val = val;
max_index = index;
}
}
dstData[poolIndex] = max_val;
dstMaskData[poolIndex] = max_index;
*dstData = max_val;
*dstMaskData = max_index;
}
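// advance to the next output location, wrapping across rows, channels
// and images as needed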
if( ++x0 >= width )
{
x0 = 0;
if( ++y0 >= height )
{
y0 = 0;
if( ++c >= channels )
{
c = 0;
if( ++n >= nimgs )
break;
}
srcData = src_->ptr<float>(n, c);
}
}
}
}
};
void maxPooling(Mat &src, Mat &dst, Mat &mask)
{
const int nstripes = getNumThreads();
MaxPoolingInvoker mp(src, dst, mask, kernel, stride, pad, nstripes);
parallel_for_(Range(0, nstripes), mp, nstripes);
}
void avePooling(Mat &src, Mat &dst)
@@ -40,7 +40,6 @@
//M*/
#include "../precomp.hpp"
#include "op_blas.hpp"
#include <iostream>
#include <iterator>
#include <cmath>
@@ -243,9 +242,9 @@ public:
Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
dnn::gemm(xCurr, Wx, 1, gates, 0, GEMM_2_T); // Wx * x_t
dnn::gemm(hInternal, Wh, 1, gates, 1, GEMM_2_T); //+Wh * h_{t-1}
dnn::gemm(dummyOnes, bias, 1, gates, 1); //+b
gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
Mat getesIFO = gates.colRange(0, 3*numOut);
Mat gateI = gates.colRange(0*numOut, 1*numOut);
@@ -419,14 +418,14 @@ public:
Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
dnn::gemm(hPrev, Whh, 1, hCurr, 0, GEMM_2_T); // W_{hh} * h_{prev}
dnn::gemm(xCurr, Wxh, 1, hCurr, 1, GEMM_2_T); //+W_{xh} * x_{curr}
dnn::gemm(dummyBiasOnes, bh, 1, hCurr, 1); //+bh
gemm(hPrev, Whh, 1, hCurr, 0, hCurr, GEMM_2_T); // W_{hh} * h_{prev}
gemm(xCurr, Wxh, 1, hCurr, 1, hCurr, GEMM_2_T); //+W_{xh} * x_{curr}
gemm(dummyBiasOnes, bh, 1, hCurr, 1, hCurr); //+bh
tanh(hCurr, hPrev);
Mat oCurr = oTs.rowRange(curRowRange);
dnn::gemm(hPrev, Who, 1, oCurr, 0, GEMM_2_T); // W_{ho} * h_{prev}
dnn::gemm(dummyBiasOnes, bo, 1, oCurr, 1); //+b_o
gemm(hPrev, Who, 1, oCurr, 0, oCurr, GEMM_2_T); // W_{ho} * h_{prev}
gemm(dummyBiasOnes, bo, 1, oCurr, 1, oCurr); //+b_o
tanh(oCurr, oCurr);
if (produceH)
@@ -10,7 +10,6 @@ Implementation of shift layer, which adds up const values to blob.
*/
#include "../precomp.hpp"
#include "op_blas.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
@@ -25,15 +24,6 @@ public:
{
setParamsFrom(params);
CV_Assert(blobs.size() == 1);
#ifdef HAVE_LAPACK
{
if (getBlasThreads() != cv::getThreadNum())
{
setBlasThreads(cv::getThreadNum());
}
}
#endif
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -76,7 +66,7 @@ public:
{
Mat dstMat(inpBlob.size[1], inpBlob.size[2] * inpBlob.size[3],
outBlob.type(), outBlob.ptr(n));
dnn::gemm(blobs[0], biasOnesMat, 1, dstMat, 1); //TODO: gemv
gemm(blobs[0], biasOnesMat, 1, dstMat, 1, dstMat); //TODO: gemv
}
}
}