Commit b593cae0 authored by Vadim Pisarevsky's avatar Vadim Pisarevsky Committed by GitHub

some further optimizations and cleanups in dnn (#1237)

* some further optimizations and cleanups in dnn:
+ got rid of dnn::gemm; it's not perf critical anymore (perhaps)
+ embedded col2im functionality into convolution_layer.cpp, since it's not used anywhere else
+ parallel max pooling. even better performance can be achieved if we knew that max indices are not needed (and they are not needed in most networks)
+ somewhat optimized deconvolution layer: optimized bias addition (merged it with col2im), optimized col2im slightly.
+ hopefully fixed incorrect memory access in fully-connected layer; restored aligned memory reads (they should work fine now)

* hopefully fixed regressions in ENet performance

* fixed some typos in deconvolution; added SIMD optimization for the max pooling layer

* fixed warnings in SIMD-less build configuration
parent 0b4fc061
...@@ -41,9 +41,8 @@ ...@@ -41,9 +41,8 @@
#include "../precomp.hpp" #include "../precomp.hpp"
#include "layers_common.hpp" #include "layers_common.hpp"
#include "op_im2col.hpp"
#include "op_blas.hpp"
#include "op_halide.hpp" #include "op_halide.hpp"
#include "opencv2/core/hal/hal.hpp"
#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/hal/intrin.hpp"
#include <iostream> #include <iostream>
...@@ -55,16 +54,7 @@ namespace dnn ...@@ -55,16 +54,7 @@ namespace dnn
class BaseConvolutionLayerImpl : public ConvolutionLayer class BaseConvolutionLayerImpl : public ConvolutionLayer
{ {
public: public:
BaseConvolutionLayerImpl() BaseConvolutionLayerImpl() {}
{
#ifdef HAVE_LAPACK
int nthreads = cv::getThreadNum();
if (getBlasThreads() != nthreads)
{
setBlasThreads(nthreads);
}
#endif
}
virtual bool supportBackend(int backendId) virtual bool supportBackend(int backendId)
{ {
...@@ -146,7 +136,7 @@ public: ...@@ -146,7 +136,7 @@ public:
class ConvolutionLayerImpl : public BaseConvolutionLayerImpl class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
{ {
public: public:
enum { VEC_ALIGN = 8 }; enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F };
Mat weightsMat; Mat weightsMat;
Ptr<ActivationLayer> activ; Ptr<ActivationLayer> activ;
...@@ -195,7 +185,11 @@ public: ...@@ -195,7 +185,11 @@ public:
return false; return false;
} }
#if 0
bool setActivation(const Ptr<ActivationLayer>& layer) { activ = layer; return true; } bool setActivation(const Ptr<ActivationLayer>& layer) { activ = layer; return true; }
#else
bool setActivation(const Ptr<ActivationLayer>&) { return false; }
#endif
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{ {
...@@ -379,9 +373,8 @@ public: ...@@ -379,9 +373,8 @@ public:
const float* biasvec = &biasvec_[0]; const float* biasvec = &biasvec_[0];
float* data_out0_ = output_->ptr<float>(); float* data_out0_ = output_->ptr<float>();
size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE; size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
const int valignBytes = (int)(valign*sizeof(float)); AutoBuffer<float> rowbuf0_(rowbufsz + valign);
AutoBuffer<float> rowbuf0_(rowbufsz + valignBytes); float* rowbuf0 = alignPtr((float*)rowbuf0_, (int)(valign*sizeof(float)));
float* rowbuf0 = alignPtr((float*)rowbuf0_, valignBytes);
// we clear the buffer once; ultimately, it lets us to avoid // we clear the buffer once; ultimately, it lets us to avoid
// tail processing after running the unrolled/vectorized loop. // tail processing after running the unrolled/vectorized loop.
...@@ -588,8 +581,349 @@ public: ...@@ -588,8 +581,349 @@ public:
} }
}; };
class ParallelDFTWeights : ParallelLoopBody
{
public:
const Mat* weights_;
Mat* wspectrums_;
int nstripes_;
Size kernel_, dftsz_;
int nouts_, ninps_;
static void run(const Mat& weights, Mat& wspectrums, Size kernel, Size dftsz, int nstripes)
{
CV_Assert(weights.type() == DFT_TYPE);
ParallelDFTWeights p;
p.weights_ = &weights;
p.wspectrums_ = &wspectrums;
p.nstripes_ = nstripes;
p.kernel_ = kernel;
p.dftsz_ = dftsz;
p.nouts_ = weights.rows;
p.ninps_ = weights.cols / (kernel.area());
int dft_total = dftsz.area();
int sz[] = { p.nouts_, p.ninps_, dft_total };
wspectrums.create(3, sz, DFT_TYPE);
parallel_for_(Range(0, nstripes), p, nstripes);
}
ParallelDFTWeights() {}
void operator()(const Range& r) const
{
int ninps = ninps_, nouts = nouts_;
int totalDFTs = nouts*ninps;
int stripeSize = (totalDFTs + nstripes_-1)/nstripes_;
int stripeStart = r.start*stripeSize;
int stripeEnd = std::min(r.end*stripeSize, totalDFTs);
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int dft_w = dftsz_.width, dft_h = dftsz_.height;
float* wptr = (float*)weights_->ptr<float>();
size_t wstep = weights_->step1();
Ptr<hal::DFT2D> dft2d_fwd = hal::DFT2D::create(dft_w, dft_h, DFT_TYPE, 1, 1, 0, kernel_h);
for( int i = stripeStart; i < stripeEnd; i++ )
{
int out = i / ninps;
int inp = i % ninps;
float* srcptr = wptr + out*wstep + inp*kernel_w*kernel_h;
Mat src(kernel_h, kernel_w, DFT_TYPE, srcptr);
float* dstptr = wspectrums_->ptr<float>(out, inp);
Mat dst(dft_h, dft_w, DFT_TYPE, dstptr);
size_t dstep = dft_w*sizeof(dstptr[0]);
memset(dstptr, 0, dstep*dft_h);
for( int j = 0; j < kernel_h; j++ )
memcpy(dstptr + dft_w*j, srcptr + kernel_w*j, kernel_w*sizeof(dstptr[0]));
dft2d_fwd->apply((uchar*)dstptr, dstep, (uchar*)dstptr, dstep);
}
}
};
/*class ParallelDFTConv : public ParallelLoopBody
{
public:
enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 };
const Mat* input_;
const Mat* weights_;
Mat* output_;
Mat wspectrums_;
int outShape[4];
Size kernel_, pad_, blksz_, dftsz_;
int ngroups_, nstripes_;
std::vector<float> biasvec_;
const ActivationLayer* activ_;
static void run( const Mat& input, Mat& output,
const Mat& weights, const Mat& bias,
Size kernel, Size pad, int ngroups, int nstripes,
const ActivationLayer* activ )
{
CV_Assert( input.dims == 4 && output.dims == 4 &&
input.size[0] == output.size[0] &&
weights.rows == output.size[1] &&
weights.cols == (input.size[1]/ngroups)*kernel.width*kernel.height &&
input.type() == output.type() &&
input.type() == weights.type() &&
input.type() == CV_32F &&
input.isContinuous() &&
output.isContinuous() &&
(bias.empty() || (bias.isContinuous() && bias.type() == CV_32F &&
bias.total() == (size_t)output.size[1])));
ParallelDFTConv p;
p.input_ = &input;
p.weights_ = &weights;
p.output_ = &output;
for( int i = 0; i < 4; i++ ) p.outShape[i] = output.size[i];
p.outShape[1] /= ngroups;
p.kernel_ = kernel; p.pad_ = pad;
p.ngroups_ = ngroups;
p.nstripes_ = nstripes;
p.activ_ = activ;
const double blockScale = 4.5;
const int minBlockSize = 32;
Size resultsz(output.size[3], output.size[2]);
Size blksz, dftsz;
blksz.width = cvRound(kernel.width*blockScale);
blksz.width = std::max(blksz.width, minBlockSize - kernel.width + 1);
blksz.width = std::min(blksz.width, resultsz.width);
blksz.height = cvRound(kernel.height*blockScale);
blksz.height = std::max(blksz.height, minBlockSize - kernel.height + 1);
blksz.height = std::min(blksz.height, resultsz.height);
// compute DFT size along each dimension; make sure it's even, because we want
// real DFT & inverse DFT to be fast.
dftsz.width = blksz.width + kernel.width - 1;
for(;;)
{
dftsz.width = getOptimalDFTSize(dftsz.width);
if( dftsz.width <= 0 )
CV_Error( CV_StsOutOfRange, "cannot compute the right DFT size" );
if(dftsz.width % 2 == 0)
break;
dftsz.width++;
}
dftsz.height = blksz.height + kernel.height - 1;
for(;;)
{
dftsz.height = getOptimalDFTSize(dftsz.height);
if( dftsz.height <= 0 )
CV_Error( CV_StsOutOfRange, "cannot compute the right DFT size" );
if(dftsz.height % 2 == 0)
break;
}
// transform all the weights for the layer; we do it on each run because
// if we compute and store spectrums of all the weights for all the convolution
// layers, it may take a lot of memory
ParallelDFTWeights::run(weights, p.wspectrums_, kernel, dftsz, nstripes);
// recompute block size
blksz.width = dftsz.width - kernel.width + 1;
blksz.width = std::min(blksz.width, resultsz.width);
blksz.height = dftsz.height - kernel.height + 1;
blksz.height = std::min(blksz.height, resultsz.height);
printf("DFT conv: blk=(%d x %d), DFT=(%d x %d)\n", blksz.width, blksz.height, dftsz.width, dftsz.height);
p.dftsz_ = dftsz;
p.blksz_ = blksz;
int k, outCn = output.size[1];
p.biasvec_.resize(outCn+2);
float* biasvec = &p.biasvec_[0];
if( bias.empty() )
{
for( k = 0; k < outCn; k++ )
biasvec[k] = 0.f;
}
else
{
for( k = 0; k < outCn; k++ )
biasvec[k] = bias.at<float>(k);
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
parallel_for_(Range(0, nstripes), p, nstripes);
}
ParallelDFTConv() {}
void operator()(const Range& r0) const
{
int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
int out_w = output_->size[3], out_h = output_->size[2], outCn = output_->size[1]/ngroups;
int width = input_->size[3], height = input_->size[2], inpCn = input_->size[1]/ngroups;
int nstripes = nstripes_;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int blk_w = blksz_.width, blk_h = blksz_.height;
int dft_w = dftsz_.width, dft_h = dftsz_.height;
int dft_elems = dft_w*dft_h;
size_t dftstep = dft_w*sizeof(float);
int i, j;
size_t inpPlaneSize = width*height;
size_t outPlaneSize = out_w*out_h;
int ndfts_w = (out_w + blk_w - 1)/blk_w;
int ndfts_h = (out_h + blk_h - 1)/blk_h;
int ndfts_plane = ndfts_w*ndfts_h;
int stripesPerSample;
int ndfts_stripe;
Range r = r0;
if( nstripes >= batchSize*2 )
{
stripesPerSample = nstripes/batchSize;
ndfts_stripe = (ndfts_plane + stripesPerSample - 1)/stripesPerSample;
}
else
{
stripesPerSample = 1;
int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1);
r.start *= samplesPerStripe;
r.end *= samplesPerStripe;
nstripes *= samplesPerStripe;
ndfts_stripe = ndfts_plane;
}
Mat spectrums((inpCn+1)*dft_h, dft_w, DFT_TYPE);
Mat out_spectrum = spectrums.rowRange(dft_h*inpCn, dft_h*(inpCn+1));
const float* wptr0 = wspectrums_.ptr<float>();
const float* data_inp0_ = input_->ptr<float>();
const float* biasvec = &biasvec_[0];
float* data_out0_ = output_->ptr<float>();
float dft_scale = 1.f/(dft_w*dft_h);
Ptr<hal::DFT2D> dft2d_fwd = hal::DFT2D::create(dft_w, dft_h, DFT_TYPE, 1, 1,
CV_HAL_DFT_IS_INPLACE, blk_h + kernel_h - 1);
Ptr<hal::DFT2D> dft2d_inv = hal::DFT2D::create(dft_w, dft_h, DFT_TYPE, 1, 1,
CV_HAL_DFT_INVERSE|CV_HAL_DFT_SCALE, blk_h);
for( int stripe = r.start; stripe < r.end; stripe++ )
{
int subsampleIdx = stripe/stripesPerSample;
if( subsampleIdx >= batchSize )
break;
int startOutCn = (subsampleIdx % ngroups)*outCn;
const float* biasptr = biasvec + startOutCn;
int dft_idx0 = (stripe - subsampleIdx*stripesPerSample)*ndfts_stripe;
int dft_idx1 = std::min(dft_idx0 + ndfts_stripe, ndfts_plane);
for( int dft_idx = dft_idx0; dft_idx < dft_idx1; dft_idx++ )
{
int dft_y = dft_idx / dft_w;
int dft_x = dft_idx - dft_y*dft_w;
dft_x *= blk_w;
dft_y *= blk_h;
int bw = std::min(blk_w, out_w - dft_x);
int bh = std::min(blk_h, out_h - dft_y);
int patch_w = bw + kernel_w - 1;
int patch_h = bh + kernel_h - 1;
int in_x = dft_x - pad_w;
int in_y = dft_y - pad_h;
int i0 = std::max(0, -in_y);
int i1 = std::min(patch_h, height - in_y);
int j0 = std::max(0, -in_x);
int j1 = std::min(patch_w, width - in_x);
const float* data_inp = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn + in_y*width + in_x;
float* sdata0 = spectrums.ptr<float>();
float* data_out = data_out0_ + subsampleIdx*outPlaneSize*outCn + dft_y*out_w + dft_x;
// phase 1. extract tiles from the input tensor channels and
// compute their spectrums.
float* sdata = sdata0;
for( int cn = 0; cn < inpCn; cn++, data_inp += inpPlaneSize )
{
for( i = 0; i < dft_h; i++, sdata += dft_w )
{
if( i < i0 || i >= i1 )
memset(sdata, 0, dft_w*sizeof(sdata[0]));
else
{
for( j = 0; j < j0; j++ )
sdata[j] = 0.f;
for( ; j < j1; j++ )
sdata[j] = data_inp[i*width + j];
for( ; j < dft_w; j++ )
sdata[j] = 0.f;
}
}
uchar* dftdata = (uchar*)(sdata - dft_elems);
dft2d_fwd->apply(dftdata, dftstep, dftdata, dftstep);
}
// phase 2. iterate over output channels. For each output channel multiply
// all the input channels by the corresponding weights and sum the results.
// all this is done in the Fourier domain.
// When the sum is computed, apply the inverse DFT, then add bias and save
// the results.
for( int ocn = 0; ocn < outCn; ocn++, data_out += outPlaneSize )
{
float* odata = out_spectrum.ptr<float>();
memset(odata, 0, dft_elems*sizeof(odata[0]));
for( int cn = 0; cn < inpCn; cn++ )
{
const float* wptr = wptr0 + ((ocn + startOutCn)*inpCn + cn)*dft_elems;
const float* sdata = sdata0 + cn*dft_elems;
odata[0] += sdata[0]*wptr[0];
odata[dft_w-1] += sdata[dft_w-1]*wptr[dft_w-1];
odata[dft_elems-dft_w] += sdata[dft_elems-dft_w]*wptr[dft_elems-dft_w];
odata[dft_elems-1] += sdata[dft_elems-1]*wptr[dft_elems-1];
for( i = 1; i < dft_h-1; i += 2 )
{
int re = i*dft_w, im = re + dft_w;
odata[re] += sdata[re]*wptr[re] + sdata[im]*wptr[im];
odata[im] += sdata[im]*wptr[re] - sdata[re]*wptr[im];
re += dft_w-1; im += dft_w-1;
odata[re] += sdata[re]*wptr[re] + sdata[im]*wptr[im];
odata[im] += sdata[im]*wptr[re] - sdata[re]*wptr[im];
}
for( i = 0; i < dft_h; i++ )
{
for( j = 1; j < dft_w-1; j += 2 )
{
int idx = i*dft_w + j;
float re = sdata[idx], im = sdata[idx+1];
float wre = wptr[idx], wim = wptr[idx+1];
float ore = odata[idx], oim = odata[idx+1];
odata[idx] = ore + re*wre + im*wim;
odata[idx+1] = oim + im*wre - re*wim;
}
}
}
dft2d_inv->apply((const uchar*)odata, dftstep, (uchar*)odata, dftstep);
float bias = biasptr[ocn];
for( i = 0; i < bh; i++ )
{
for( j = 0; j < bw; j++ )
{
data_out[i*out_w + j] = odata[i*dft_w + j] + bias;
}
}
}
}
}
}
};*/
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{ {
/*printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
name.c_str(), inputs[0]->size[0], inputs[0]->size[1], inputs[0]->size[2], inputs[0]->size[3],
kernel.width, kernel.height, pad.width, pad.height,
stride.width, stride.height, dilation.width, dilation.height);*/
CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0); CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
int ngroups = inputs[0]->size[1]/blobs[0].size[1]; int ngroups = inputs[0]->size[1]/blobs[0].size[1];
CV_Assert(outputs[0].size[1] % ngroups == 0); CV_Assert(outputs[0].size[1] % ngroups == 0);
...@@ -614,9 +948,19 @@ public: ...@@ -614,9 +948,19 @@ public:
Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat(); Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
int nstripes = std::max(getNumThreads(), 1); int nstripes = std::max(getNumThreads(), 1);
/*if( stride == Size(1, 1) && dilation == Size(1, 1) && kernel.width >= 3 && kernel.height >= 3 )
{
ParallelDFTConv::run(*inputs[0], outputs[0], weightsMat, biasesMat,
kernel, pad, ngroups, nstripes, activ.get());
}
else*/
{
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasesMat, ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasesMat,
kernel, pad, stride, dilation, ngroups, nstripes, activ.get()); kernel, pad, stride, dilation, ngroups, nstripes, activ.get());
} }
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs, virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const const std::vector<MatShape> &outputs) const
...@@ -636,6 +980,8 @@ public: ...@@ -636,6 +980,8 @@ public:
class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
{ {
public: public:
Mat weightsMat, biasesMat;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{ {
int inpCn = inpShape[1]; int inpCn = inpShape[1];
...@@ -682,70 +1028,283 @@ public: ...@@ -682,70 +1028,283 @@ public:
return false; return false;
} }
class MatMulInvoker : public ParallelLoopBody
{
public:
MatMulInvoker(const Mat& a, const Mat& b, Mat& c, int nstripes)
{
a_ = &a;
b_ = &b;
c_ = &c;
nstripes_ = nstripes;
useAVX2 = checkHardwareSupport(CPU_AVX2);
}
void operator()(const Range& range_) const
{
int stripeSize = (int)alignSize((b_->cols + nstripes_ - 1)/nstripes_, 16);
Range range(range_.start*stripeSize, std::min(range_.end*stripeSize, b_->cols));
int mmax = a_->rows;
int nmax = range.end - range.start;
int kmax = a_->cols;
int m, n, k;
const float* aptr = a_->ptr<float>();
const float* bptr = b_->ptr<float>() + range.start;
float* cptr = c_->ptr<float>() + range.start;
size_t astep = a_->step1();
size_t bstep = b_->step1();
size_t cstep = c_->step1();
#if CV_DNN_TRY_AVX2
if( useAVX2 )
fastGEMM_avx2( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
else
#endif
for( m = 0; m < mmax; m += 2 )
{
float* dst0 = cptr + cstep*m;
float* dst1 = cptr + cstep*std::min(m+1, mmax-1);
const float* aptr0 = aptr + astep*m;
const float* aptr1 = aptr + astep*std::min(m+1, mmax-1);
for( n = 0; n < nmax; n++ )
{
dst0[n] = 0.f;
dst1[n] = 0.f;
}
for( k = 0; k < kmax; k += 4 )
{
float alpha00 = aptr0[k];
float alpha01 = aptr1[k];
float alpha10 = 0.f, alpha11 = 0.f;
float alpha20 = 0.f, alpha21 = 0.f;
float alpha30 = 0.f, alpha31 = 0.f;
const float* bptr0 = bptr + k*bstep;
const float* bptr1 = bptr0;
const float* bptr2 = bptr0;
const float* bptr3 = bptr0;
if( k+1 < kmax )
{
alpha10 = aptr0[k+1];
alpha11 = aptr1[k+1];
bptr1 = bptr0 + bstep;
if( k+2 < kmax )
{
alpha20 = aptr0[k+2];
alpha21 = aptr1[k+2];
bptr2 = bptr1 + bstep;
if( k+3 < kmax )
{
alpha30 = aptr0[k+3];
alpha31 = aptr1[k+3];
bptr3 = bptr2 + bstep;
}
}
}
n = 0;
#if CV_SIMD128
v_float32x4 a00 = v_setall_f32(alpha00);
v_float32x4 a01 = v_setall_f32(alpha01);
v_float32x4 a10 = v_setall_f32(alpha10);
v_float32x4 a11 = v_setall_f32(alpha11);
v_float32x4 a20 = v_setall_f32(alpha20);
v_float32x4 a21 = v_setall_f32(alpha21);
v_float32x4 a30 = v_setall_f32(alpha30);
v_float32x4 a31 = v_setall_f32(alpha31);
for( ; n <= nmax - 4; n += 4 )
{
v_float32x4 b0 = v_load(bptr0 + n);
v_float32x4 b1 = v_load(bptr1 + n);
v_float32x4 b2 = v_load(bptr2 + n);
v_float32x4 b3 = v_load(bptr3 + n);
v_float32x4 d0 = v_load(dst0 + n);
v_float32x4 d1 = v_load(dst1 + n);
d0 += b0*a00;
d1 += b0*a01;
d0 += b1*a10;
d1 += b1*a11;
d0 += b2*a20;
d1 += b2*a21;
d0 += b3*a30;
d1 += b3*a31;
v_store(dst0 + n, d0);
v_store(dst1 + n, d1);
}
#endif
for( ; n < nmax; n++ )
{
float b0 = bptr0[n], b1 = bptr1[n];
float b2 = bptr2[n], b3 = bptr3[n];
float d0 = dst0[n] + alpha00*b0 + alpha10*b1 + alpha20*b2 + alpha30*b3;
float d1 = dst1[n] + alpha01*b0 + alpha11*b1 + alpha21*b2 + alpha31*b3;
dst0[n] = d0;
dst1[n] = d1;
}
}
}
}
const Mat *a_, *b_;
Mat* c_;
int nstripes_;
bool useAVX2;
};
class Col2ImInvoker : public cv::ParallelLoopBody
{
public:
const float* data_col;
const float* biasvec;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
float* data_im;
int height_col, width_col;
int nstripes;
bool is1x1;
Col2ImInvoker() {}
static void run(const float* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
float* data_im,
const float* biasvec,
bool is1x1)
{
const int nstripes = getNumThreads();
Col2ImInvoker t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
t.nstripes = nstripes;
t.is1x1 = is1x1;
t.biasvec = biasvec;
parallel_for_(Range(0, nstripes), t, nstripes);
}
virtual void operator ()(const Range &r) const
{
const float* data_col_ = data_col;
float* data_im_ = data_im;
int coeff_h = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w = (1 - stride_w * height_col * width_col);
size_t total = (size_t)channels * height * width;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t startIndex = r.start*stripeSize;
size_t endIndex = std::min(r.end*stripeSize, total);
int w = (int)(startIndex % width + pad_w);
int h = (int)((startIndex / width) % height + pad_h);
int c = (int)(startIndex / (width * height));
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
int plane_size_col = height_col * width_col;
int offset = (c * kernel_h * kernel_w + h * kernel_w + w) * plane_size_col;
bool is1x1_ = is1x1;
const float* biasvec_ = biasvec;
for (size_t index = startIndex; index < endIndex; index++)
{
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
float val;
if( is1x1_ )
val = data_im_[index];
else
{
val = 0.f;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h + w_col * coeff_w];
}
}
}
data_im_[index] = val + biasvec_[c];
offset += plane_size_col;
if( ++w >= width + pad_w )
{
w = (int)((index + 1)% width + pad_w);
h = (int)(((index + 1) / width) % height + pad_h);
c = (int)((index + 1) / (width * height));
h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
h_col_end = std::min(h / stride_h + 1, height_col);
offset = (c * kernel_h * kernel_w + h * kernel_w + w) * plane_size_col;
}
}
}
};
void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{ {
internals[0].setTo(0);
if (hasBias()) if (hasBias())
internals[1].setTo(1); internals[1].setTo(1);
int outCn = blobs[0].size[0]; int outCn = blobs[0].size[0];
int inpCn = inputs[0]->size[1]; int inpCn = inputs[0]->size[1];
Mat weightsMat = blobs[0].reshape(1, inpCn); bool is1x1flag = is1x1();
Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat(); int nstripes = getNumThreads();
if( weightsMat.empty() )
{
transpose(blobs[0].reshape(1, inpCn), weightsMat);
biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat::zeros(outCn, 1, CV_32F);
}
for (size_t ii = 0; ii < outputs.size(); ii++) for (size_t ii = 0; ii < outputs.size(); ii++)
{ {
int ngroups = inpCn / blobs[0].size[1]; int ngroups = inpCn / blobs[0].size[1];
int inpGroupCn = blobs[0].size[1]; int inpGroupCn = blobs[0].size[1];
int outGroupCn = outCn / ngroups; int outGroupCn = outCn / ngroups;
int numImg = inputs[ii]->size[0]; const Mat& inp = *inputs[ii];
Mat& out = outputs[ii];
int numImg = inp.size[0];
int outH = out.size[2], outW = out.size[3];
Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn); Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
Mat decnBlob = outputs[ii].reshape(1, numImg*outCn); Mat decnBlob = out.reshape(1, numImg*outCn);
for (int n = 0; n < numImg; n++) for (int n = 0; n < numImg; n++)
{ {
for (int g = 0; g < ngroups; g++) for (int g = 0; g < ngroups; g++)
{ {
Mat dstMat = decnBlob.rowRange(_Range((g + n * ngroups) * outGroupCn, outGroupCn)); Mat dstMat = decnBlob.rowRange(_Range((g + n * ngroups) * outGroupCn, outGroupCn));
Mat &colMat = (is1x1()) ? dstMat : internals[0]; Mat &colMat = is1x1flag ? dstMat : internals[0];
Mat convMat = convBlob.rowRange(_Range((g + n * ngroups) * inpGroupCn, inpGroupCn)); Mat convMat = convBlob.rowRange(_Range((g + n * ngroups) * inpGroupCn, inpGroupCn));
Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn)); Mat wghtMat = weightsMat.colRange(_Range(g * inpGroupCn, inpGroupCn));
Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
if (!is1x1()) //gemm(wghtMat, convMat, 1, colMat, 0, colMat, 0);
col2im(colMat, dstMat, shape(*inputs[ii]), shape(outputs[ii])); MatMulInvoker mminvoker(wghtMat, convMat, colMat, nstripes);
parallel_for_(Range(0, nstripes), mminvoker, nstripes);
if (hasBias()) Col2ImInvoker::run(colMat.ptr<float>(), outGroupCn, outH, outW,
{ kernel.height, kernel.width, pad.height, pad.width,
Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn)); stride.height, stride.width, dstMat.ptr<float>(),
dnn::gemm(curBiasMat, internals[1], 1, dstMat, 1); curBiasMat.ptr<float>(), is1x1flag);
}
} }
} }
} }
} }
void col2im(const Mat &colMat, Mat &dstImg, const MatShape& inShape, const MatShape& outShape)
{
int outCn = outShape[1], outH = outShape[2], outW = outShape[3];
int inpCn = inShape[1];
int ngroups = inpCn / blobs[0].size[1];
int outGroupCn = outCn / ngroups;
if (is1x1())
{
dstImg = colMat;
return;
}
cv::dnn::col2im(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width,
pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, dstImg.ptr<float>(), &ofsbuf[0]);
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{ {
#ifdef HAVE_HALIDE #ifdef HAVE_HALIDE
...@@ -808,8 +1367,6 @@ public: ...@@ -808,8 +1367,6 @@ public:
return flops; return flops;
} }
std::vector<int> ofsbuf;
}; };
//Convolution and Deconvolution //Convolution and Deconvolution
......
...@@ -41,7 +41,6 @@ ...@@ -41,7 +41,6 @@
#include "../precomp.hpp" #include "../precomp.hpp"
#include "layers_common.hpp" #include "layers_common.hpp"
#include "op_blas.hpp"
#include "op_halide.hpp" #include "op_halide.hpp"
#include <opencv2/dnn/shape_utils.hpp> #include <opencv2/dnn/shape_utils.hpp>
...@@ -133,33 +132,42 @@ public: ...@@ -133,33 +132,42 @@ public:
void operator()(const Range& r) const void operator()(const Range& r) const
{ {
int valign = FullyConnectedLayerImpl::VEC_ALIGN;
int nsamples = srcMat_->rows; int nsamples = srcMat_->rows;
int nw0 = weights_->rows; int nw0 = weights_->rows;
int vecsize = srcMat_->cols; int k, vecsize = srcMat_->cols;
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
int nstripes = nstripes_; int nstripes = nstripes_;
size_t total = (size_t)nsamples*nw0; size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes; size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize; size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total); size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights_->step1(); size_t wstep = weights_->step1();
AutoBuffer<float> srcbuf(vecsize_aligned + valign);
float* sptr = alignPtr((float*)srcbuf, (int)(valign*sizeof(float)));
for( k = vecsize; k < vecsize_aligned; k++ )
sptr[k] = 0.f;
for( size_t ofs = stripeStart; ofs < stripeEnd; ) for( size_t ofs = stripeStart; ofs < stripeEnd; )
{ {
int sampleIdx = (int)(ofs / nw0); int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0); int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr = srcMat_->ptr<float>(sampleIdx); const float* sptr_ = srcMat_->ptr<float>(sampleIdx);
const float* wptr = weights_->ptr<float>(delta); const float* wptr = weights_->ptr<float>(delta);
float* dptr = dstMat_->ptr<float>(sampleIdx) + delta; float* dptr = dstMat_->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat_->ptr<float>() + delta; const float* biasptr = biasMat_->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs)); int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
#if CV_DNN_TRY_AVX2 #if CV_DNN_TRY_AVX2
if( useAVX2_ ) if( useAVX2_ )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else else
#endif #endif
{ {
int i = 0, k; int i = 0;
#if CV_SIMD128 #if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
...@@ -169,7 +177,7 @@ public: ...@@ -169,7 +177,7 @@ public:
for( k = 0; k < vecsize; k += 4 ) for( k = 0; k < vecsize; k += 4 )
{ {
vfloat32x4 v = v_load(sptr + k); vfloat32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k); vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k); vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k); vs2 += v*v_load_aligned(wptr + wstep*2 + k);
......
...@@ -204,7 +204,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights, ...@@ -204,7 +204,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
for( int k = 0; k < vecsize; k += 8, wptr += 8 ) for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{ {
__m256 v = _mm256_loadu_ps(vec + k); __m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0); vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1); vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
...@@ -237,7 +237,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights, ...@@ -237,7 +237,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
for( int k = 0; k < vecsize; k += 8, wptr += 8 ) for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{ {
__m256 v = _mm256_loadu_ps(vec + k); __m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0); vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
} }
...@@ -250,6 +250,76 @@ void fastGEMM1T_avx2( const float* vec, const float* weights, ...@@ -250,6 +250,76 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
_mm256_zeroupper(); _mm256_zeroupper();
} }
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb )
{
int n = 0;
for( ; n <= nb - 16; n += 16 )
{
for( int m = 0; m < ma; m += 4 )
{
const float* aptr0 = aptr + astep*m;
const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
const float* aptr3 = aptr + astep*std::min(m+3, ma-1);
float* cptr0 = cptr + cstep*m;
float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
float* cptr3 = cptr + cstep*std::min(m+3, ma-1);
__m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
__m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
__m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
__m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();
for( int k = 0; k < na; k++ )
{
__m256 a0 = _mm256_set1_ps(aptr0[k]);
__m256 a1 = _mm256_set1_ps(aptr1[k]);
__m256 a2 = _mm256_set1_ps(aptr2[k]);
__m256 a3 = _mm256_set1_ps(aptr3[k]);
__m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n);
__m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8);
d00 = _mm256_fmadd_ps(a0, b0, d00);
d01 = _mm256_fmadd_ps(a0, b1, d01);
d10 = _mm256_fmadd_ps(a1, b0, d10);
d11 = _mm256_fmadd_ps(a1, b1, d11);
d20 = _mm256_fmadd_ps(a2, b0, d20);
d21 = _mm256_fmadd_ps(a2, b1, d21);
d30 = _mm256_fmadd_ps(a3, b0, d30);
d31 = _mm256_fmadd_ps(a3, b1, d31);
}
_mm256_storeu_ps(cptr0 + n, d00);
_mm256_storeu_ps(cptr0 + n + 8, d01);
_mm256_storeu_ps(cptr1 + n, d10);
_mm256_storeu_ps(cptr1 + n + 8, d11);
_mm256_storeu_ps(cptr2 + n, d20);
_mm256_storeu_ps(cptr2 + n + 8, d21);
_mm256_storeu_ps(cptr3 + n, d30);
_mm256_storeu_ps(cptr3 + n + 8, d31);
}
}
_mm256_zeroupper();
for( ; n < nb; n++ )
{
for( int m = 0; m < ma; m++ )
{
const float* aptr0 = aptr + astep*m;
float* cptr0 = cptr + cstep*m;
float d0 = 0.f;
for( int k = 0; k < na; k++ )
d0 += aptr0[k]*bptr[k*bstep + n];
cptr0[n] = d0;
}
}
}
} }
} }
......
...@@ -42,8 +42,6 @@ ...@@ -42,8 +42,6 @@
#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__ #ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__ #define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp> #include <opencv2/dnn.hpp>
#include "op_blas.hpp"
#include "op_im2col.hpp"
#include <opencv2/dnn/shape_utils.hpp> #include <opencv2/dnn/shape_utils.hpp>
namespace cv namespace cv
...@@ -74,6 +72,9 @@ void fastConv_avx2(const float* weights, size_t wstep, const float* bias, ...@@ -74,6 +72,9 @@ void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
void fastGEMM1T_avx2( const float* vec, const float* weights, void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias, size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize ); float* dst, int nvecs, int vecsize );
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#else #else
#define CV_DNN_TRY_AVX2 0 #define CV_DNN_TRY_AVX2 0
......
...@@ -41,7 +41,6 @@ ...@@ -41,7 +41,6 @@
#include "../precomp.hpp" #include "../precomp.hpp"
#include "layers_common.hpp" #include "layers_common.hpp"
#include "op_blas.hpp"
#include <float.h> #include <float.h>
#include <algorithm> #include <algorithm>
...@@ -182,14 +181,14 @@ public: ...@@ -182,14 +181,14 @@ public:
Mat norm(channelSize, 1, buffer.type()); // 1 x channelSize Mat norm(channelSize, 1, buffer.type()); // 1 x channelSize
// (_channels x channelSize)T * _channels x 1 -> channelSize x 1 // (_channels x channelSize)T * _channels x 1 -> channelSize x 1
gemmCPU(buffer, sumChannelMultiplier, 1, norm, 0, GEMM_1_T); gemm(buffer, sumChannelMultiplier, 1, norm, 0, norm, GEMM_1_T);
// compute norm // compute norm
pow(norm, 0.5f, norm); pow(norm, 0.5f, norm);
// scale the layer // scale the layer
// _channels x 1 * (channelSize x 1)T -> _channels x channelSize // _channels x 1 * (channelSize x 1)T -> _channels x channelSize
gemmCPU(sumChannelMultiplier, norm, 1, buffer, 0, GEMM_2_T); gemm(sumChannelMultiplier, norm, 1, buffer, 0, buffer, GEMM_2_T);
dst = src / buffer; dst = src / buffer;
} }
...@@ -204,7 +203,7 @@ public: ...@@ -204,7 +203,7 @@ public:
{ {
// _scale: _channels x 1 // _scale: _channels x 1
// _channels x 1 * 1 x channelSize -> _channels x channelSize // _channels x 1 * 1 x channelSize -> _channels x channelSize
gemmCPU(scale, sumSpatialMultiplier, 1, buffer, 0); gemm(scale, sumSpatialMultiplier, 1, buffer, 0, buffer);
dst = dst.mul(buffer); dst = dst.mul(buffer);
} }
......
#include "op_blas.hpp"
#ifdef HAVE_LAPACK
#include "opencv_lapack.h"
#endif
#include <iostream>
namespace cv
{
namespace dnn
{
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags)
{
if (C.isMat())
gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
else
{
cv::gemm(A, B, alpha, (beta == 0) ? noArray() : C, beta, C, flags);
}
}
inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
{
CV_DbgAssert(A.dims == 2);
rows = (isTrans) ? A.cols : A.rows;
cols = (isTrans) ? A.rows : A.cols;
}
class GEMMInvoker : public ParallelLoopBody
{
public:
GEMMInvoker(const Mat* _a, const Mat* _b, double _alpha, Mat* _c, double _beta)
{
a = _a;
b = _b;
c = _c;
alpha = _alpha;
beta = _beta;
}
void operator()(const Range& range) const
{
int mmax = a->rows;
int nmax = range.end - range.start;
int kmax = a->cols;
int m, n, k;
AutoBuffer<float> buf(nmax);
float* ptr = buf;
if( mmax %2 != 0 )
memset(ptr, 0, nmax*sizeof(ptr[0]));
for( m = 0; m < mmax; m += 2 )
{
float* dst0 = c->ptr<float>(m) + range.start;
float* dst1 = m+1 < mmax ? c->ptr<float>(m+1) + range.start : ptr;
const float* aptr0 = a->ptr<float>(m);
const float* aptr1 = m+1 < mmax ? a->ptr<float>(m+1) : aptr0;
if( beta != 1 )
{
if( beta == 0 )
for( n = 0; n < nmax; n++ )
{
dst0[n] = 0.f;
dst1[n] = 0.f;
}
else
for( n = 0; n < nmax; n++ )
{
dst0[n] *= (float)beta;
dst1[n] *= (float)beta;
}
}
for( k = 0; k < kmax; k++ )
{
float alpha0 = (float)(alpha*aptr0[k]);
float alpha1 = (float)(alpha*aptr1[k]);
const float* bptr = b->ptr<float>(k) + range.start;
for( n = 0; n < nmax; n++ )
{
float d0 = dst0[n] + alpha0*bptr[n];
float d1 = dst1[n] + alpha1*bptr[n];
dst0[n] = d0;
dst1[n] = d1;
}
}
}
}
const Mat *a, *b;
Mat* c;
double alpha, beta;
};
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags /*= 0*/)
{
#ifdef HAVE_LAPACK
bool transA = static_cast<bool>(flags & GEMM_1_T);
bool transB = static_cast<bool>(flags & GEMM_2_T);
bool transC = static_cast<bool>(flags & GEMM_3_T);
int Arows, Acols, Brows, Bcols, Crows, Ccols;
SwapRowCols(A, Arows, Acols, transA);
SwapRowCols(B, Brows, Bcols, transB);
SwapRowCols(C, Crows, Ccols, transC);
CV_Assert(!(flags & GEMM_3_T));
CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
CV_Assert(A.type() == B.type() && B.type() == C.type());
CV_Assert(A.data != C.data && B.data != C.data);
if (C.type() == CV_32F)
{
cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
Arows, Bcols, Acols,
(float)alpha, A.ptr<float>(), A.cols,
B.ptr<float>(), B.cols,
(float)beta, C.ptr<float>(), C.cols);
}
else if (C.type() == CV_64F)
{
//TODO: Should be tested
cblas_dgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans, transB ? CblasTrans : CblasNoTrans,
Arows, Bcols, Acols,
alpha, A.ptr<double>(), A.cols,
B.ptr<double>(), B.cols,
beta, C.ptr<double>(), C.cols);
}
else
{
CV_Error(Error::BadDepth, "Only floating point types are supported");
}
#else
if( C.type() == CV_32F && flags == 0 )
{
GEMMInvoker invoker(&A, &B, alpha, &C, beta);
double granularity = 10000000./((double)A.rows*A.cols);
parallel_for_(Range(0, B.cols), invoker, granularity);
}
else
cv::gemm(A, B, alpha, C, beta, C, flags);
#endif
}
int getBlasThreads()
{
#ifdef OPENBLAS_VERSION
return openblas_get_num_threads();
#else
return 1;
#endif
}
void setBlasThreads(int numThreads)
{
#ifdef OPENBLAS_VERSION
openblas_set_num_threads(numThreads);
goto_set_num_threads(numThreads);
#else
(void)numThreads; //suppress compilers' warning
#endif
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#define __OPENCV_DNN_LAYERS_OP_BLAS_HPP__
#include "../precomp.hpp"
namespace cv
{
namespace dnn
{
int getBlasThreads();
void setBlasThreads(int numThreads);
void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags = 0);
void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags = 0);
}
}
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include <opencv2/core/ocl.hpp>
#include "opencl_kernels_dnn.hpp"
#include "op_im2col.hpp"
#include "opencl_kernels_dnn.hpp"
namespace cv {
namespace dnn {
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
const Dtype* data_col_ = data_col;
Dtype* data_im_ = data_im;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im_[index] = val;
}
}
};
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
Dtype* data_im,
const int* ofsbuf)
{
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c, ofsbuf += 3)
{
//int w_offset = c % kernel_w;
//int h_offset = (c / kernel_w) % kernel_h;
//int c_im = c / kernel_h / kernel_w;
int w_offset = ofsbuf[0];
int h_offset = ofsbuf[1];
int c_im = ofsbuf[2];
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf)
{
(void)dilation_h;
(void)dilation_w;
(void)ofsbuf;
col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
#if 0
col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
#endif
}
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
#define __OPENCV_DNN_LAYERS_IM2COL_HPP__
#include <opencv2/core.hpp>
#include <cstdlib>
namespace cv
{
namespace dnn
{
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col);
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf);
}
}
#endif
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include "../precomp.hpp" #include "../precomp.hpp"
#include "layers_common.hpp" #include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp" #include "op_halide.hpp"
#include <float.h> #include <float.h>
#include <algorithm> #include <algorithm>
...@@ -130,51 +131,151 @@ public: ...@@ -130,51 +131,151 @@ public:
return Ptr<BackendNode>(); return Ptr<BackendNode>();
} }
void maxPooling(Mat &src, Mat &dst, Mat &mask) class MaxPoolingInvoker : public ParallelLoopBody
{ {
Size inp(src.size[3], src.size[2]), public:
out(dst.size[3], dst.size[2]); const Mat* src_;
Mat *dst_, *mask_;
for (int n = 0; n < src.size[0]; ++n) Size kernel_, stride_, pad_;
{ int nstripes_;
for (int c = 0; c < src.size[1]; ++c)
{ MaxPoolingInvoker(const Mat& src, Mat& dst, Mat& mask, Size kernel, Size stride, Size pad, int nstripes)
const float *srcData = src.ptr<float>(n, c); {
float *dstData = dst.ptr<float>(n, c); src_ = &src;
float *dstMaskData = mask.ptr<float>(n, c); dst_ = &dst;
mask_ = &mask;
for (int ph = 0; ph < out.height; ++ph) kernel_ = kernel;
{ stride_ = stride;
for (int pw = 0; pw < out.width; ++pw) pad_ = pad;
{ nstripes_ = nstripes;
int hstart = ph * stride.height - pad.height;
int wstart = pw * stride.width - pad.width; CV_Assert(src.isContinuous() && dst.isContinuous() &&
int hend = min(hstart + kernel.height, inp.height); src.type() == CV_32F && src.type() == dst.type() &&
int wend = min(wstart + kernel.width, inp.width); mask.type() == src.type() && src.dims == 4 && dst.dims == 4 &&
hstart = max(hstart, 0); src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
wstart = max(wstart, 0); mask.size == dst.size);
const int poolIndex = ph * out.width + pw; }
void operator()(const Range& r) const
{
int nimgs = dst_->size[0], channels = dst_->size[1];
int width = dst_->size[3], height = dst_->size[2];
int inp_width = src_->size[3], inp_height = src_->size[2];
size_t total = dst_->total();
size_t stripeSize = (total + nstripes_ - 1)/nstripes_;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, total);
size_t ofs = stripeStart;
int x0 = (int)(ofs % width);
ofs /= width;
int y0 = (int)(ofs % height);
ofs /= height;
int c = (int)(ofs % channels);
int n = (int)(ofs / channels);
const float *srcData = src_->ptr<float>(n, c);
float *dstData = dst_->ptr<float>(n, c, y0) + x0;
float *dstMaskData = mask_->ptr<float>(n, c, y0) + x0;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int stride_w = stride_.width, stride_h = stride_.height;
#if CV_SIMD128
v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
v_float32x4 ones = v_setall_f32(1.f);
v_float32x4 delta = v_setall_f32((float)(inp_width - kernel_w));
#endif
for( ofs = stripeStart; ofs < stripeEnd; ofs++, dstData++, dstMaskData++ )
{
int ystart = y0 * stride_h - pad_h;
int xstart = x0 * stride_w - pad_w;
int yend = min(ystart + kernel_h, inp_height);
int xend = min(xstart + kernel_w, inp_width);
ystart = max(ystart, 0);
xstart = max(xstart, 0);
float max_val = -FLT_MAX; float max_val = -FLT_MAX;
int max_index = -1; int max_index = -1;
for (int h = hstart; h < hend; ++h) #if CV_SIMD128
for (int w = wstart; w < wend; ++w) if( xstart > 0 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
{
v_float32x4 max_val0 = v_setall_f32(max_val);
v_float32x4 max_val1 = max_val0;
v_float32x4 max_idx0 = v_setall_f32(-1.f);
v_float32x4 max_idx1 = max_idx0;
int index0 = ystart * inp_width + xstart;
v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
for (int y = ystart; y < yend; ++y)
{
for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
{
const int index = y * inp_width + x;
v_float32x4 v0(srcData[index], srcData[index + stride_w],
srcData[index + stride_w*2], srcData[index + stride_w*3]);
v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
srcData[index + stride_w*6], srcData[index + stride_w*7]);
max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
max_val0 = v_max(max_val0, v0);
max_val1 = v_max(max_val1, v1);
}
idx0 += delta;
idx1 += delta;
}
v_store(dstData, max_val0);
v_store(dstData + 4, max_val1);
v_store(dstMaskData, max_idx0);
v_store(dstMaskData + 4, max_idx1);
ofs += 7;
dstData += 7;
dstMaskData += 7;
x0 += 7;
}
else
#endif
{
for (int y = ystart; y < yend; ++y)
for (int x = xstart; x < xend; ++x)
{ {
const int index = h * inp.width + w; const int index = y * inp_width + x;
if (srcData[index] > max_val) float val = srcData[index];
if (val > max_val)
{ {
max_val = srcData[index]; max_val = val;
max_index = index; max_index = index;
} }
} }
dstData[poolIndex] = max_val; *dstData = max_val;
dstMaskData[poolIndex] = max_index; *dstMaskData = max_index;
} }
if( ++x0 >= width )
{
x0 = 0;
if( ++y0 >= height )
{
y0 = 0;
if( ++c >= channels )
{
c = 0;
if( ++n >= nimgs )
break;
}
srcData = src_->ptr<float>(n, c);
} }
} }
} }
} }
};
void maxPooling(Mat &src, Mat &dst, Mat &mask)
{
const int nstripes = getNumThreads();
MaxPoolingInvoker mp(src, dst, mask, kernel, stride, pad, nstripes);
parallel_for_(Range(0, nstripes), mp, nstripes);
}
void avePooling(Mat &src, Mat &dst) void avePooling(Mat &src, Mat &dst)
{ {
......
...@@ -40,7 +40,6 @@ ...@@ -40,7 +40,6 @@
//M*/ //M*/
#include "../precomp.hpp" #include "../precomp.hpp"
#include "op_blas.hpp"
#include <iostream> #include <iostream>
#include <iterator> #include <iterator>
#include <cmath> #include <cmath>
...@@ -243,9 +242,9 @@ public: ...@@ -243,9 +242,9 @@ public:
Range curRowRange(ts*numSamples, (ts + 1)*numSamples); Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
Mat xCurr = xTs.rowRange(curRowRange); Mat xCurr = xTs.rowRange(curRowRange);
dnn::gemm(xCurr, Wx, 1, gates, 0, GEMM_2_T); // Wx * x_t gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
dnn::gemm(hInternal, Wh, 1, gates, 1, GEMM_2_T); //+Wh * h_{t-1} gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
dnn::gemm(dummyOnes, bias, 1, gates, 1); //+b gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
Mat getesIFO = gates.colRange(0, 3*numOut); Mat getesIFO = gates.colRange(0, 3*numOut);
Mat gateI = gates.colRange(0*numOut, 1*numOut); Mat gateI = gates.colRange(0*numOut, 1*numOut);
...@@ -419,14 +418,14 @@ public: ...@@ -419,14 +418,14 @@ public:
Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples); Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples);
Mat xCurr = xTs.rowRange(curRowRange); Mat xCurr = xTs.rowRange(curRowRange);
dnn::gemm(hPrev, Whh, 1, hCurr, 0, GEMM_2_T); // W_{hh} * h_{prev} gemm(hPrev, Whh, 1, hCurr, 0, hCurr, GEMM_2_T); // W_{hh} * h_{prev}
dnn::gemm(xCurr, Wxh, 1, hCurr, 1, GEMM_2_T); //+W_{xh} * x_{curr} gemm(xCurr, Wxh, 1, hCurr, 1, hCurr, GEMM_2_T); //+W_{xh} * x_{curr}
dnn::gemm(dummyBiasOnes, bh, 1, hCurr, 1); //+bh gemm(dummyBiasOnes, bh, 1, hCurr, 1, hCurr); //+bh
tanh(hCurr, hPrev); tanh(hCurr, hPrev);
Mat oCurr = oTs.rowRange(curRowRange); Mat oCurr = oTs.rowRange(curRowRange);
dnn::gemm(hPrev, Who, 1, oCurr, 0, GEMM_2_T); // W_{ho} * h_{prev} gemm(hPrev, Who, 1, oCurr, 0, oCurr, GEMM_2_T); // W_{ho} * h_{prev}
dnn::gemm(dummyBiasOnes, bo, 1, oCurr, 1); //+b_o gemm(dummyBiasOnes, bo, 1, oCurr, 1, oCurr); //+b_o
tanh(oCurr, oCurr); tanh(oCurr, oCurr);
if (produceH) if (produceH)
......
...@@ -10,7 +10,6 @@ Implementation of shift layer, which adds up const values to blob. ...@@ -10,7 +10,6 @@ Implementation of shift layer, which adds up const values to blob.
*/ */
#include "../precomp.hpp" #include "../precomp.hpp"
#include "op_blas.hpp"
#include <opencv2/dnn/shape_utils.hpp> #include <opencv2/dnn/shape_utils.hpp>
namespace cv namespace cv
...@@ -25,15 +24,6 @@ public: ...@@ -25,15 +24,6 @@ public:
{ {
setParamsFrom(params); setParamsFrom(params);
CV_Assert(blobs.size() == 1); CV_Assert(blobs.size() == 1);
#ifdef HAVE_LAPACK
{
if (getBlasThreads() != cv::getThreadNum())
{
setBlasThreads(cv::getThreadNum());
}
}
#endif
} }
bool getMemoryShapes(const std::vector<MatShape> &inputs, bool getMemoryShapes(const std::vector<MatShape> &inputs,
...@@ -76,7 +66,7 @@ public: ...@@ -76,7 +66,7 @@ public:
{ {
Mat dstMat(inpBlob.size[1], inpBlob.size[2] * inpBlob.size[3], Mat dstMat(inpBlob.size[1], inpBlob.size[2] * inpBlob.size[3],
outBlob.type(), outBlob.ptr(n)); outBlob.type(), outBlob.ptr(n));
dnn::gemm(blobs[0], biasOnesMat, 1, dstMat, 1); //TODO: gemv gemm(blobs[0], biasOnesMat, 1, dstMat, 1, dstMat); //TODO: gemv
} }
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment