Commit b593cae0 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky Committed by GitHub

some further optimizations and cleanups in dnn (#1237)

* some further optimizations and cleanups in dnn:
+ got rid of dnn::gemm; it's not perf critical anymore (perhaps)
+ embedded col2im functionality into convolution_layer.cpp, since it's not used anywhere else
+ parallel max pooling. even better performance can be achieved if we knew that max indices are not needed (and they are not needed in most networks)
+ somewhat optimized deconvolution layer: optimized bias addition (merged it with col2im), optimized col2im slightly.
+ hopefully fixed incorrect memory access in fully-connected layer; restored aligned memory reads (they should work fine now)

* hopefully fixed regressions in ENet performance

* fixed some typos in deconvolution; added SIMD optimization for the max pooling layer

* fixed warnings in SIMD-less build configuration
parent 0b4fc061
......@@ -41,7 +41,6 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_blas.hpp"
#include "op_halide.hpp"
#include <opencv2/dnn/shape_utils.hpp>
......@@ -133,33 +132,42 @@ public:
void operator()(const Range& r) const
{
int valign = FullyConnectedLayerImpl::VEC_ALIGN;
int nsamples = srcMat_->rows;
int nw0 = weights_->rows;
int vecsize = srcMat_->cols;
int k, vecsize = srcMat_->cols;
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
int nstripes = nstripes_;
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights_->step1();
AutoBuffer<float> srcbuf(vecsize_aligned + valign);
float* sptr = alignPtr((float*)srcbuf, (int)(valign*sizeof(float)));
for( k = vecsize; k < vecsize_aligned; k++ )
sptr[k] = 0.f;
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr = srcMat_->ptr<float>(sampleIdx);
const float* sptr_ = srcMat_->ptr<float>(sampleIdx);
const float* wptr = weights_->ptr<float>(delta);
float* dptr = dstMat_->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat_->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
#if CV_DNN_TRY_AVX2
if( useAVX2_ )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0, k;
int i = 0;
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
......@@ -169,7 +177,7 @@ public:
for( k = 0; k < vecsize; k += 4 )
{
vfloat32x4 v = v_load(sptr + k);
vfloat32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
......
......@@ -204,7 +204,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_loadu_ps(vec + k);
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
......@@ -237,7 +237,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_loadu_ps(vec + k);
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
}
......@@ -250,6 +250,76 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
_mm256_zeroupper();
}
// Plain single-precision GEMM for AVX2+FMA: C = A*B (no alpha/beta scaling,
// C is overwritten). A is ma x na with row stride astep, B is na x nb with
// row stride bstep, C is ma x nb with row stride cstep; strides are in
// elements, not bytes.
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr,
                    size_t bstep, float* cptr, size_t cstep,
                    int ma, int na, int nb )
{
    int n = 0;

    // Vectorized part: accumulate a 4x16 block of C (4 rows x two 8-float
    // AVX registers) over the whole shared dimension na.
    for( ; n <= nb - 16; n += 16 )
    {
        for( int m = 0; m < ma; m += 4 )
        {
            // When ma is not a multiple of 4, the trailing row indices are
            // clamped to ma-1; the duplicated rows then compute and store
            // identical values — redundant work, but never out of bounds.
            const float* aptr0 = aptr + astep*m;
            const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
            const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
            const float* aptr3 = aptr + astep*std::min(m+3, ma-1);

            float* cptr0 = cptr + cstep*m;
            float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
            float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
            float* cptr3 = cptr + cstep*std::min(m+3, ma-1);

            __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
            __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
            __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
            __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();

            for( int k = 0; k < na; k++ )
            {
                // Broadcast one A element per row ...
                __m256 a0 = _mm256_set1_ps(aptr0[k]);
                __m256 a1 = _mm256_set1_ps(aptr1[k]);
                __m256 a2 = _mm256_set1_ps(aptr2[k]);
                __m256 a3 = _mm256_set1_ps(aptr3[k]);

                // ... and multiply-accumulate with 16 consecutive elements of
                // the k-th row of B. Unaligned loads: B rows are not
                // guaranteed to be 32-byte aligned.
                __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n);
                __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8);

                d00 = _mm256_fmadd_ps(a0, b0, d00);
                d01 = _mm256_fmadd_ps(a0, b1, d01);
                d10 = _mm256_fmadd_ps(a1, b0, d10);
                d11 = _mm256_fmadd_ps(a1, b1, d11);
                d20 = _mm256_fmadd_ps(a2, b0, d20);
                d21 = _mm256_fmadd_ps(a2, b1, d21);
                d30 = _mm256_fmadd_ps(a3, b0, d30);
                d31 = _mm256_fmadd_ps(a3, b1, d31);
            }

            _mm256_storeu_ps(cptr0 + n, d00);
            _mm256_storeu_ps(cptr0 + n + 8, d01);
            _mm256_storeu_ps(cptr1 + n, d10);
            _mm256_storeu_ps(cptr1 + n + 8, d11);
            _mm256_storeu_ps(cptr2 + n, d20);
            _mm256_storeu_ps(cptr2 + n + 8, d21);
            _mm256_storeu_ps(cptr3 + n, d30);
            _mm256_storeu_ps(cptr3 + n + 8, d31);
        }
    }

    // Leave AVX state before the scalar tail to avoid SSE/AVX transition
    // penalties.
    _mm256_zeroupper();

    // Scalar tail: remaining (< 16) columns of C, one element at a time.
    for( ; n < nb; n++ )
    {
        for( int m = 0; m < ma; m++ )
        {
            const float* aptr0 = aptr + astep*m;
            float* cptr0 = cptr + cstep*m;
            float d0 = 0.f;

            for( int k = 0; k < na; k++ )
                d0 += aptr0[k]*bptr[k*bstep + n];

            cptr0[n] = d0;
        }
    }
}
}
}
......
......@@ -42,8 +42,6 @@
#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp>
#include "op_blas.hpp"
#include "op_im2col.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
......@@ -74,6 +72,9 @@ void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#else
#define CV_DNN_TRY_AVX2 0
......
......@@ -41,7 +41,6 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_blas.hpp"
#include <float.h>
#include <algorithm>
......@@ -182,14 +181,14 @@ public:
Mat norm(channelSize, 1, buffer.type()); // 1 x channelSize
// (_channels x channelSize)T * _channels x 1 -> channelSize x 1
gemmCPU(buffer, sumChannelMultiplier, 1, norm, 0, GEMM_1_T);
gemm(buffer, sumChannelMultiplier, 1, norm, 0, norm, GEMM_1_T);
// compute norm
pow(norm, 0.5f, norm);
// scale the layer
// _channels x 1 * (channelSize x 1)T -> _channels x channelSize
gemmCPU(sumChannelMultiplier, norm, 1, buffer, 0, GEMM_2_T);
gemm(sumChannelMultiplier, norm, 1, buffer, 0, buffer, GEMM_2_T);
dst = src / buffer;
}
......@@ -204,7 +203,7 @@ public:
{
// _scale: _channels x 1
// _channels x 1 * 1 x channelSize -> _channels x channelSize
gemmCPU(scale, sumSpatialMultiplier, 1, buffer, 0);
gemm(scale, sumSpatialMultiplier, 1, buffer, 0, buffer);
dst = dst.mul(buffer);
}
......
#include "op_blas.hpp"
#ifdef HAVE_LAPACK
#include "opencv_lapack.h"
#endif
#include <iostream>
namespace cv
{
namespace dnn
{
// Dispatching gemm for the dnn module: C = alpha*op(A)*op(B) + beta*C.
// When C is a plain Mat the dedicated CPU path (gemmCPU) is used; otherwise
// the computation falls back to the generic cv::gemm.
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags)
{
    if (!C.isMat())
    {
        // When beta == 0 the prior contents of C are irrelevant, so pass
        // noArray() as src3 to cv::gemm.
        cv::gemm(A, B, alpha, (beta == 0) ? noArray() : C, beta, C, flags);
        return;
    }
    gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
}
// Reports the logical (post-transposition) dimensions of a 2D matrix:
// if isTrans is set, rows and cols are swapped relative to A's layout.
inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
{
    CV_DbgAssert(A.dims == 2);
    if (isTrans)
    {
        rows = A.cols;
        cols = A.rows;
    }
    else
    {
        rows = A.rows;
        cols = A.cols;
    }
}
class GEMMInvoker : public ParallelLoopBody
{
public:
GEMMInvoker(const Mat* _a, const Mat* _b, double _alpha, Mat* _c, double _beta)
{
a = _a;
b = _b;
c = _c;
alpha = _alpha;
beta = _beta;
}
void operator()(const Range& range) const
{
int mmax = a->rows;
int nmax = range.end - range.start;
int kmax = a->cols;
int m, n, k;
AutoBuffer<float> buf(nmax);
float* ptr = buf;
if( mmax %2 != 0 )
memset(ptr, 0, nmax*sizeof(ptr[0]));
for( m = 0; m < mmax; m += 2 )
{
float* dst0 = c->ptr<float>(m) + range.start;
float* dst1 = m+1 < mmax ? c->ptr<float>(m+1) + range.start : ptr;
const float* aptr0 = a->ptr<float>(m);
const float* aptr1 = m+1 < mmax ? a->ptr<float>(m+1) : aptr0;
if( beta != 1 )
{
if( beta == 0 )
for( n = 0; n < nmax; n++ )
{
dst0[n] = 0.f;
dst1[n] = 0.f;
}
else
for( n = 0; n < nmax; n++ )
{
dst0[n] *= (float)beta;
dst1[n] *= (float)beta;
}
}
for( k = 0; k < kmax; k++ )
{
float alpha0 = (float)(alpha*aptr0[k]);
float alpha1 = (float)(alpha*aptr1[k]);
const float* bptr = b->ptr<float>(k) + range.start;
for( n = 0; n < nmax; n++ )
{
float d0 = dst0[n] + alpha0*bptr[n];
float d1 = dst1[n] + alpha1*bptr[n];
dst0[n] = d0;
dst1[n] = d1;
}
}
}
}
const Mat *a, *b;
Mat* c;
double alpha, beta;
};
// CPU GEMM: C = alpha*op(A)*op(B) + beta*C, where op() is an optional
// transposition selected via GEMM_1_T / GEMM_2_T in 'flags'.
// With LAPACK the call goes to cblas; otherwise a simple parallel fallback
// is used for the common (CV_32F, flags == 0) case and cv::gemm for the rest.
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags /*= 0*/)
{
#ifdef HAVE_LAPACK
    bool transA = static_cast<bool>(flags & GEMM_1_T);
    bool transB = static_cast<bool>(flags & GEMM_2_T);
    bool transC = static_cast<bool>(flags & GEMM_3_T);

    // Logical (post-transposition) dimensions of each operand.
    int Arows, Acols, Brows, Bcols, Crows, Ccols;
    SwapRowCols(A, Arows, Acols, transA);
    SwapRowCols(B, Brows, Bcols, transB);
    SwapRowCols(C, Crows, Ccols, transC);

    // Transposed output is not supported; operands must be shape-compatible,
    // continuous, of the same type, and C must not alias A or B.
    CV_Assert(!(flags & GEMM_3_T));
    CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
    CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
    CV_Assert(A.type() == B.type() && B.type() == C.type());
    CV_Assert(A.data != C.data && B.data != C.data);

    if (C.type() == CV_32F)
    {
        // Row-major cblas call; leading dimensions are the physical row
        // lengths (the untransposed .cols of each matrix).
        cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
                    Arows, Bcols, Acols,
                    (float)alpha, A.ptr<float>(), A.cols,
                    B.ptr<float>(), B.cols,
                    (float)beta, C.ptr<float>(), C.cols);
    }
    else if (C.type() == CV_64F)
    {
        //TODO: Should be tested
        cblas_dgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
                    Arows, Bcols, Acols,
                    alpha, A.ptr<double>(), A.cols,
                    B.ptr<double>(), B.cols,
                    beta, C.ptr<double>(), C.cols);
    }
    else
    {
        CV_Error(Error::BadDepth, "Only floating point types are supported");
    }
#else
    if( C.type() == CV_32F && flags == 0 )
    {
        // Hand-rolled parallel fallback; granularity keeps per-stripe work
        // roughly constant regardless of the size of A.
        GEMMInvoker invoker(&A, &B, alpha, &C, beta);
        double granularity = 10000000./((double)A.rows*A.cols);
        parallel_for_(Range(0, B.cols), invoker, granularity);
    }
    else
        cv::gemm(A, B, alpha, C, beta, C, flags);
#endif
}
// Returns the number of worker threads used by the BLAS backend.
// Only meaningful when built against OpenBLAS; otherwise reports 1.
int getBlasThreads()
{
#ifdef OPENBLAS_VERSION
    return openblas_get_num_threads();
#else
    return 1;
#endif
}
// Sets the number of worker threads for the BLAS backend; a no-op when not
// built against OpenBLAS. Both the openblas_ and goto_ entry points are
// updated (the latter presumably for GotoBLAS-compatible builds — confirm).
void setBlasThreads(int numThreads)
{
#ifdef OPENBLAS_VERSION
    openblas_set_num_threads(numThreads);
    goto_set_num_threads(numThreads);
#else
    (void)numThreads; //suppress compilers' warning
#endif
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#define __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#include "../precomp.hpp"
namespace cv
{
namespace dnn
{
// Number of threads currently used by the BLAS backend (1 without OpenBLAS).
int getBlasThreads();
// Set the BLAS backend's thread count (no-op without OpenBLAS).
void setBlasThreads(int numThreads);
// C = alpha*op(A)*op(B) + beta*C; dispatches to gemmCPU for Mat outputs and
// to cv::gemm otherwise. 'flags' accepts GEMM_1_T / GEMM_2_T.
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags = 0);
// CPU-only variant operating directly on Mat; requires continuous data of
// matching floating-point type, no GEMM_3_T, and C not aliasing A or B.
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags = 0);
}
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include <opencv2/core/ocl.hpp>
#include "opencl_kernels_dnn.hpp"
#include "op_im2col.hpp"
#include "opencl_kernels_dnn.hpp"
namespace cv {
namespace dnn {
// Parallel col2im: scatter-adds a column buffer (the output of im2col) back
// into an image. The work is parallelized over the pixels of the destination
// image: each work item computes one data_im element by summing every
// data_col entry whose kernel window covers that pixel.
// NOTE: this implementation does not handle dilation (the col2im() wrapper
// below discards its dilation arguments before calling run()).
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
    const Dtype* data_col;       // input column buffer, (channels*kernel_h*kernel_w) rows
    int channels, height, width; // destination image geometry
    int kernel_h, kernel_w;
    int pad_h, pad_w;
    int stride_h, stride_w;
    Dtype* data_im;              // output image, channels x height x width
    int height_col, width_col;   // spatial grid size of the column buffer

    col2im_CpuPBody() {}         // instances are created only through run()

public:
    // Fills in the parameters, derives the column-grid size and runs the
    // body in parallel over all channels*height*width destination elements.
    static void run(const Dtype* data_col,
                    int channels, int height, int width,
                    int kernel_h, int kernel_w,
                    int pad_h, int pad_w,
                    int stride_h, int stride_w,
                    Dtype* data_im)
    {
        //TODO: single-threaded version switch
        col2im_CpuPBody t;
        t.data_col = data_col;
        t.data_im = data_im;
        t.channels = channels; t.height = height; t.width = width;
        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
        t.pad_h = pad_h; t.pad_w = pad_w;
        t.stride_h = stride_h; t.stride_w = stride_w;
        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;

        int img_total = channels * height * width;
        cv::parallel_for_(Range(0, img_total), t);
    }

    virtual void operator ()(const Range &r) const
    {
        const Dtype* data_col_ = data_col;
        Dtype* data_im_ = data_im;
        // Linear coefficients that fold the per-(h_col, w_col) update of the
        // column-buffer index into simple increments (the standard Caffe
        // col2im index transformation).
        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
        int coeff_w_col = (1 - stride_w * height_col * width_col);
        for (int index = r.start; index < r.end; index++)
        {
            Dtype val = 0;
            // Decompose the flat destination index into (c, h, w); h and w
            // are shifted by the padding so they address padded coordinates.
            int w = index % width + pad_w;
            int h = (index / width) % height + pad_h;
            int c = index / (width * height);

            // compute the start and end of the output:
            // the range of column-grid positions whose kernel covers (h, w).
            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
            int w_col_end = std::min(w / stride_w + 1, width_col);
            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
            int h_col_end = std::min(h / stride_h + 1, height_col);

            // equivalent implementation
            int offset =
                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
                }
            }
            data_im_[index] = val;
        }
    }
};
// Single-threaded col2im (kept as a reference implementation): scatter-adds
// the column buffer 'data_col' back into the image 'data_im', honoring
// padding, stride and dilation. 'ofsbuf' supplies three precomputed values
// per column-buffer row: {w_offset, h_offset, c_im}, i.e. the kernel-tap
// coordinates and source channel that row corresponds to.
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
                int channels, int height, int width,
                int kernel_h, int kernel_w,
                int pad_h, int pad_w,
                int stride_h, int stride_w,
                int dilation_h, int dilation_w,
                Dtype* data_im,
                const int* ofsbuf)
{
    // Spatial grid of the column buffer (output size of the convolution).
    const int out_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int out_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
    const int ncols = channels * kernel_h * kernel_w;

    // The image accumulates contributions, so it must start from zero.
    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));

    for (int col = 0; col < ncols; ++col, ofsbuf += 3)
    {
        const int w_offset = ofsbuf[0];
        const int h_offset = ofsbuf[1];
        const int c_im     = ofsbuf[2];

        Dtype* im_plane = data_im + c_im * height * width;
        const Dtype* col_row = data_col + col * out_h * out_w;

        for (int oy = 0; oy < out_h; ++oy)
        {
            const int y = oy * stride_h - pad_h + h_offset * dilation_h;
            if (y < 0 || y >= height)
                continue; // this kernel tap falls entirely into the padding row

            for (int ox = 0; ox < out_w; ++ox)
            {
                const int x = ox * stride_w - pad_w + w_offset * dilation_w;
                if (x >= 0 && x < width)
                    im_plane[y * width + x] += col_row[oy * out_w + ox];
            }
        }
    }
}
// col2im entry point: scatter-adds a column buffer ('data_col', as produced
// by im2col) back into the image 'data_im'. Dispatches to the parallel
// implementation above.
// NOTE: the dilation parameters and 'ofsbuf' are accepted for interface
// compatibility but are ignored by the parallel path, which assumes
// dilation == 1.
void col2im(const float* data_col, int channels, int height, int width,
            int kernel_h, int kernel_w, int pad_h, int pad_w,
            int stride_h, int stride_w, int dilation_h, int dilation_w,
            float* data_im, const int* ofsbuf)
{
    (void)dilation_h;
    (void)dilation_w;
    (void)ofsbuf;
    col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h,
                                kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
#if 0
    // Single-threaded alternative that does honor dilation and ofsbuf;
    // kept disabled for reference/debugging.
    col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
               stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
#endif
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
#define __OPENCV_DNN_LAYERS_IM2COL_HPP__
#include <opencv2/core.hpp>
#include <cstdlib>
namespace cv
{
namespace dnn
{
// Unfolds convolution patches of 'data_im' into 'data_col' (presumably the
// row-major counterpart of im2col — see the implementation in op_im2col.cpp).
void im2row(const float* data_im, int channels, int height, int width,
            int kernel_h, int kernel_w, int pad_h, int pad_w,
            int stride_h, int stride_w, int dilation_h, int dilation_w,
            int height_col, int width_col, float* data_col);
// Inverse operation: scatter-adds the column buffer back into 'data_im'.
// NOTE: the current implementation ignores dilation_h/dilation_w and ofsbuf.
void col2im(const float* data_col, int channels, int height, int width,
            int kernel_h, int kernel_w, int pad_h, int pad_w,
            int stride_h, int stride_w, int dilation_h, int dilation_w,
            float* data_im, const int* ofsbuf);
}
}
#endif
......@@ -41,6 +41,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
#include <float.h>
#include <algorithm>
......@@ -130,50 +131,150 @@ public:
return Ptr<BackendNode>();
}
void maxPooling(Mat &src, Mat &dst, Mat &mask)
class MaxPoolingInvoker : public ParallelLoopBody
{
Size inp(src.size[3], src.size[2]),
out(dst.size[3], dst.size[2]);
public:
const Mat* src_;
Mat *dst_, *mask_;
Size kernel_, stride_, pad_;
int nstripes_;
for (int n = 0; n < src.size[0]; ++n)
MaxPoolingInvoker(const Mat& src, Mat& dst, Mat& mask, Size kernel, Size stride, Size pad, int nstripes)
{
for (int c = 0; c < src.size[1]; ++c)
{
const float *srcData = src.ptr<float>(n, c);
float *dstData = dst.ptr<float>(n, c);
float *dstMaskData = mask.ptr<float>(n, c);
src_ = &src;
dst_ = &dst;
mask_ = &mask;
kernel_ = kernel;
stride_ = stride;
pad_ = pad;
nstripes_ = nstripes;
CV_Assert(src.isContinuous() && dst.isContinuous() &&
src.type() == CV_32F && src.type() == dst.type() &&
mask.type() == src.type() && src.dims == 4 && dst.dims == 4 &&
src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
mask.size == dst.size);
}
for (int ph = 0; ph < out.height; ++ph)
void operator()(const Range& r) const
{
int nimgs = dst_->size[0], channels = dst_->size[1];
int width = dst_->size[3], height = dst_->size[2];
int inp_width = src_->size[3], inp_height = src_->size[2];
size_t total = dst_->total();
size_t stripeSize = (total + nstripes_ - 1)/nstripes_;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, total);
size_t ofs = stripeStart;
int x0 = (int)(ofs % width);
ofs /= width;
int y0 = (int)(ofs % height);
ofs /= height;
int c = (int)(ofs % channels);
int n = (int)(ofs / channels);
const float *srcData = src_->ptr<float>(n, c);
float *dstData = dst_->ptr<float>(n, c, y0) + x0;
float *dstMaskData = mask_->ptr<float>(n, c, y0) + x0;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int stride_w = stride_.width, stride_h = stride_.height;
#if CV_SIMD128
v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
v_float32x4 ones = v_setall_f32(1.f);
v_float32x4 delta = v_setall_f32((float)(inp_width - kernel_w));
#endif
for( ofs = stripeStart; ofs < stripeEnd; ofs++, dstData++, dstMaskData++ )
{
int ystart = y0 * stride_h - pad_h;
int xstart = x0 * stride_w - pad_w;
int yend = min(ystart + kernel_h, inp_height);
int xend = min(xstart + kernel_w, inp_width);
ystart = max(ystart, 0);
xstart = max(xstart, 0);
float max_val = -FLT_MAX;
int max_index = -1;
#if CV_SIMD128
if( xstart > 0 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
{
for (int pw = 0; pw < out.width; ++pw)
v_float32x4 max_val0 = v_setall_f32(max_val);
v_float32x4 max_val1 = max_val0;
v_float32x4 max_idx0 = v_setall_f32(-1.f);
v_float32x4 max_idx1 = max_idx0;
int index0 = ystart * inp_width + xstart;
v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
for (int y = ystart; y < yend; ++y)
{
int hstart = ph * stride.height - pad.height;
int wstart = pw * stride.width - pad.width;
int hend = min(hstart + kernel.height, inp.height);
int wend = min(wstart + kernel.width, inp.width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int poolIndex = ph * out.width + pw;
float max_val = -FLT_MAX;
int max_index = -1;
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
{
const int index = y * inp_width + x;
v_float32x4 v0(srcData[index], srcData[index + stride_w],
srcData[index + stride_w*2], srcData[index + stride_w*3]);
v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
srcData[index + stride_w*6], srcData[index + stride_w*7]);
max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
max_val0 = v_max(max_val0, v0);
max_val1 = v_max(max_val1, v1);
}
idx0 += delta;
idx1 += delta;
}
v_store(dstData, max_val0);
v_store(dstData + 4, max_val1);
v_store(dstMaskData, max_idx0);
v_store(dstMaskData + 4, max_idx1);
ofs += 7;
dstData += 7;
dstMaskData += 7;
x0 += 7;
}
else
#endif
{
for (int y = ystart; y < yend; ++y)
for (int x = xstart; x < xend; ++x)
{
const int index = y * inp_width + x;
float val = srcData[index];
if (val > max_val)
{
const int index = h * inp.width + w;
if (srcData[index] > max_val)
{
max_val = srcData[index];
max_index = index;
}
max_val = val;
max_index = index;
}
}
dstData[poolIndex] = max_val;
dstMaskData[poolIndex] = max_index;
*dstData = max_val;
*dstMaskData = max_index;
}
if( ++x0 >= width )
{
x0 = 0;
if( ++y0 >= height )
{
y0 = 0;
if( ++c >= channels )
{
c = 0;
if( ++n >= nimgs )
break;
}
srcData = src_->ptr<float>(n, c);
}
}
}
}
};
// Parallel max-pooling entry point: splits the output elements into
// getNumThreads() stripes and runs MaxPoolingInvoker over them.
// 'mask' receives, for every output element, the flattened input index of
// the chosen maximum (written alongside the max value by the invoker).
void maxPooling(Mat &src, Mat &dst, Mat &mask)
{
    const int nstripes = getNumThreads();
    MaxPoolingInvoker mp(src, dst, mask, kernel, stride, pad, nstripes);
    parallel_for_(Range(0, nstripes), mp, nstripes);
}
void avePooling(Mat &src, Mat &dst)
......
......@@ -40,7 +40,6 @@
//M*/
#include "../precomp.hpp"
#include "op_blas.hpp"
#include <iostream>
#include <iterator>
#include <cmath>
......@@ -243,9 +242,9 @@ public:
Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
dnn::gemm(xCurr, Wx, 1, gates, 0, GEMM_2_T); // Wx * x_t
dnn::gemm(hInternal, Wh, 1, gates, 1, GEMM_2_T); //+Wh * h_{t-1}
dnn::gemm(dummyOnes, bias, 1, gates, 1); //+b
gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
Mat getesIFO = gates.colRange(0, 3*numOut);
Mat gateI = gates.colRange(0*numOut, 1*numOut);
......@@ -419,14 +418,14 @@ public:
Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
dnn::gemm(hPrev, Whh, 1, hCurr, 0, GEMM_2_T); // W_{hh} * h_{prev}
dnn::gemm(xCurr, Wxh, 1, hCurr, 1, GEMM_2_T); //+W_{xh} * x_{curr}
dnn::gemm(dummyBiasOnes, bh, 1, hCurr, 1); //+bh
gemm(hPrev, Whh, 1, hCurr, 0, hCurr, GEMM_2_T); // W_{hh} * h_{prev}
gemm(xCurr, Wxh, 1, hCurr, 1, hCurr, GEMM_2_T); //+W_{xh} * x_{curr}
gemm(dummyBiasOnes, bh, 1, hCurr, 1, hCurr); //+bh
tanh(hCurr, hPrev);
Mat oCurr = oTs.rowRange(curRowRange);
dnn::gemm(hPrev, Who, 1, oCurr, 0, GEMM_2_T); // W_{ho} * h_{prev}
dnn::gemm(dummyBiasOnes, bo, 1, oCurr, 1); //+b_o
gemm(hPrev, Who, 1, oCurr, 0, oCurr, GEMM_2_T); // W_{ho} * h_{prev}
gemm(dummyBiasOnes, bo, 1, oCurr, 1, oCurr); //+b_o
tanh(oCurr, oCurr);
if (produceH)
......
......@@ -10,7 +10,6 @@ Implementation of shift layer, which adds up const values to blob.
*/
#include "../precomp.hpp"
#include "op_blas.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
......@@ -25,15 +24,6 @@ public:
{
setParamsFrom(params);
CV_Assert(blobs.size() == 1);
#ifdef HAVE_LAPACK
{
if (getBlasThreads() != cv::getThreadNum())
{
setBlasThreads(cv::getThreadNum());
}
}
#endif
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
......@@ -76,7 +66,7 @@ public:
{
Mat dstMat(inpBlob.size[1], inpBlob.size[2] * inpBlob.size[3],
outBlob.type(), outBlob.ptr(n));
dnn::gemm(blobs[0], biasOnesMat, 1, dstMat, 1); //TODO: gemv
gemm(blobs[0], biasOnesMat, 1, dstMat, 1, dstMat); //TODO: gemv
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment