Commit 645260af authored by Vadim Pisarevsky, committed by GitHub

optimized several conv net layers (#1227)

* rewrote the following layers to be [much] more efficient: convolution, fully connected, activations (ReLU, tanh, ...), LRN. Optional AVX optimization is used for the first two.

* eliminated trailing whitespaces
parent 009d2efb
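
For context, the central API change in this commit is that activation layers now expose ActivationLayer::forwardSlice() and BaseConvolutionLayer gains setActivation(), so the convolution kernel can apply the activation directly on each output stripe it has just computed instead of running a separate pass over the whole blob. The following standalone sketch is not part of the diff; ToyActivation, ToyReLU and main() are illustrative stand-ins that only demonstrate the forwardSlice() contract as it is used by ParallelConv below: src and dst point at the same spatial offset of channel cn0, consecutive channel planes are outPlaneSize elements apart, and len elements of each plane in [cn0, cn1) are processed.

// Standalone sketch (not OpenCV code): illustrates the forwardSlice() contract
// introduced by this commit, mirroring ReLUFunctor::apply and the call site in
// ParallelConv. Names below are illustrative only.
#include <cstdio>
#include <cstddef>
#include <vector>

struct ToyActivation
{
    // Process `len` elements of each channel plane, for channels [cn0, cn1).
    // Consecutive channel planes are outPlaneSize floats apart; src and dst
    // point at the same spatial offset of channel cn0.
    virtual void forwardSlice(const float* src, float* dst, int len,
                              size_t outPlaneSize, int cn0, int cn1) const = 0;
    virtual ~ToyActivation() {}
};

struct ToyReLU : ToyActivation
{
    float slope;
    explicit ToyReLU(float s = 0.f) : slope(s) {}
    void forwardSlice(const float* src, float* dst, int len,
                      size_t outPlaneSize, int cn0, int cn1) const
    {
        for (int cn = cn0; cn < cn1; cn++, src += outPlaneSize, dst += outPlaneSize)
            for (int i = 0; i < len; i++)
                dst[i] = src[i] >= 0.f ? src[i] : slope * src[i];
    }
};

int main()
{
    const int outCn = 2, outPlaneSize = 8;   // toy output: 2 channels, 8 pixels each
    std::vector<float> out(outCn * outPlaneSize, -1.f);
    out[3] = 2.f; out[outPlaneSize + 5] = 3.f;

    // Mimics ParallelConv: apply the fused activation in-place on a stripe of
    // the just-computed convolution output (here, the whole plane).
    ToyReLU relu(0.1f);
    relu.forwardSlice(&out[0], &out[0], outPlaneSize, outPlaneSize, 0, outCn);

    for (int i = 0; i < outCn * outPlaneSize; i++)
        printf("%.2f%c", out[i], (i + 1) % outPlaneSize ? ' ' : '\n');
    return 0;
}

In the real code, the functor-based activations (ReLUFunctor, TanHFunctor, ChannelsPReLUFunctor, ...) implement exactly this slicing pattern in their apply() methods, and ElementWiseLayer::forwardSlice simply forwards to them.
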
......@@ -201,9 +201,13 @@ namespace dnn
String padMode;
};
class CV_EXPORTS ActivationLayer;
class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
{
public:
virtual bool setActivation(const Ptr<ActivationLayer>& layer) = 0;
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
......@@ -327,8 +331,14 @@ namespace dnn
};
/* Activations */
class CV_EXPORTS ActivationLayer : public Layer
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const = 0;
};
class CV_EXPORTS ReLULayer : public Layer
class CV_EXPORTS ReLULayer : public ActivationLayer
{
public:
float negativeSlope;
......@@ -336,37 +346,37 @@ namespace dnn
static Ptr<ReLULayer> create(const LayerParams &params);
};
class CV_EXPORTS ChannelsPReLULayer : public Layer
class CV_EXPORTS ChannelsPReLULayer : public ActivationLayer
{
public:
static Ptr<ChannelsPReLULayer> create(const LayerParams& params);
};
class CV_EXPORTS TanHLayer : public Layer
class CV_EXPORTS TanHLayer : public ActivationLayer
{
public:
static Ptr<TanHLayer> create(const LayerParams &params);
};
class CV_EXPORTS SigmoidLayer : public Layer
class CV_EXPORTS SigmoidLayer : public ActivationLayer
{
public:
static Ptr<SigmoidLayer> create(const LayerParams &params);
};
class CV_EXPORTS BNLLLayer : public Layer
class CV_EXPORTS BNLLLayer : public ActivationLayer
{
public:
static Ptr<BNLLLayer> create(const LayerParams &params);
};
class CV_EXPORTS AbsLayer : public Layer
class CV_EXPORTS AbsLayer : public ActivationLayer
{
public:
static Ptr<AbsLayer> create(const LayerParams &params);
};
class CV_EXPORTS PowerLayer : public Layer
class CV_EXPORTS PowerLayer : public ActivationLayer
{
public:
float power, scale, shift;
......@@ -374,7 +384,7 @@ namespace dnn
static Ptr<PowerLayer> create(const LayerParams &params);
};
/* Layers using in semantic segmentation */
/* Layers used in semantic segmentation */
class CV_EXPORTS CropLayer : public Layer
{
......
......@@ -43,6 +43,7 @@
#include "layers_common.hpp"
#include "op_im2col.hpp"
#include "op_blas.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <iostream>
namespace cv
......@@ -96,12 +97,17 @@ public:
(stride.height == 1 && stride.width == 1) &&
(dilation.height == 1 && dilation.width == 1);
}
bool setActivation(const Ptr<ActivationLayer>& ) { return false; }
};
//TODO: perform convolution and bias addition simultaneously for cache optimization
class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
public:
enum { VEC_ALIGN = 8 };
Mat weightsMat;
Ptr<ActivationLayer> activ;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
Size out(outShape[3], outShape[2]);
......@@ -117,7 +123,7 @@ public:
{
CV_Assert(blobs.size() != 0);
CV_Assert(!hasBias() || blobs[1].total() == (size_t)blobs[0].size[0]);
CV_Assert(inputs.size() != 0);
CV_Assert(inputs.size() == (size_t)1);
internals.clear();
......@@ -138,91 +144,376 @@ public:
getConvPoolOutParams(Size(inpH, inpW), kernel, stride, padMode, out);
}
int group = inpCn / blobs[0].size[1];
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].size[0] == outCn);
int ngroups = inpCn / blobs[0].size[1];
CV_Assert(inpCn % ngroups == 0 && outCn % ngroups == 0);
int dims[] = {inputs[0][0], outCn, out.height, out.width};
outputs.resize(inputs.size(), shape(dims));
internals.push_back(MatShape());
if (!is1x1())
internals[0] = computeColRowShape(inputs[0], outputs[0]);
if (hasBias())
internals.push_back(shape(1, out.area()));
return false;
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_Assert(inputs.size() > 0);
internals[0].setTo(0);
if (hasBias())
internals[1].setTo(1);
int outCn = blobs[0].size[0];
int inpCn = inputs[0]->size[1];
int inpGroupCn = blobs[0].size[1];
bool setActivation(const Ptr<ActivationLayer>& layer) { activ = layer; return true; }
Mat weightsMat = blobs[0].reshape(1, outCn);
Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
class ParallelConv : public cv::ParallelLoopBody
{
public:
enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 };
const Mat* input_;
const Mat* weights_;
Mat* output_;
int outShape[4];
Size kernel_, pad_, stride_, dilation_;
int ngroups_, nstripes_;
std::vector<int> ofstab_;
std::vector<float> biasvec_;
const ActivationLayer* activ_;
bool is1x1_;
bool useAVX2;
ParallelConv() {}
static void run( const Mat& input, Mat& output,
const Mat& weights, const Mat& bias,
Size kernel, Size pad, Size stride, Size dilation,
int ngroups, int nstripes, const ActivationLayer* activ )
{
CV_Assert( input.dims == 4 && output.dims == 4 &&
input.size[0] == output.size[0] &&
weights.rows == output.size[1] &&
weights.cols == (input.size[1]/ngroups)*kernel.width*kernel.height &&
input.type() == output.type() &&
input.type() == weights.type() &&
input.type() == CV_32F &&
input.isContinuous() &&
output.isContinuous() &&
(bias.empty() || (bias.isContinuous() && bias.type() == CV_32F &&
bias.total() == (size_t)output.size[1])));
ParallelConv p;
p.input_ = &input;
p.weights_ = &weights;
p.output_ = &output;
for( int i = 0; i < 4; i++ ) p.outShape[i] = output.size[i];
p.outShape[1] /= ngroups;
p.kernel_ = kernel; p.pad_ = pad; p.stride_ = stride; p.dilation_ = dilation;
p.ngroups_ = ngroups;
p.nstripes_ = nstripes;
p.activ_ = activ;
int inpCnAll = input.size[1], width = input.size[3], height = input.size[2];
int inpCn = inpCnAll / ngroups;
int k, outCn = output.size[1];
p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0);
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
p.ofstab_.resize(kernel.width*kernel.height*ncn);
int* ofstab = &p.ofstab_[0];
for( k = 0; k < ncn; k++ )
for( int k_r = 0; k_r < kernel.height; k_r++ )
for( int k_c = 0; k_c < kernel.width; k_c++ )
ofstab[(k*kernel.height + k_r)*kernel.width + k_c] =
(k*height + k_r*dilation.height)*width + k_c*dilation.width;
p.biasvec_.resize(outCn+2);
float* biasvec = &p.biasvec_[0];
if( bias.empty() )
{
for( k = 0; k < outCn; k++ )
biasvec[k] = 0.f;
}
else
{
for( k = 0; k < outCn; k++ )
biasvec[k] = bias.at<float>(k);
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
parallel_for_(Range(0, nstripes), p, nstripes);
}
for (size_t ii = 0; ii < outputs.size(); ii++)
virtual void operator ()(const Range &r0) const
{
int numImg = inputs[ii]->size[0];
int group = inpCn / blobs[0].size[1];
int outGroupCn = outCn / group;
Mat inpMat = *inputs[ii];
Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);
const int valign = ConvolutionLayerImpl::VEC_ALIGN;
int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
int outW = output_->size[3], outH = output_->size[2], outCn = output_->size[1]/ngroups;
int width = input_->size[3], height = input_->size[2], inpCn = input_->size[1]/ngroups;
int nstripes = nstripes_;
int kernel_w = kernel_.width, kernel_h = kernel_.height;
int pad_w = pad_.width, pad_h = pad_.height;
int stride_w = stride_.width, stride_h = stride_.height;
int dilation_w = dilation_.width, dilation_h = dilation_.height;
int karea = kernel_w*kernel_h;
int i, j, k;
size_t inpPlaneSize = width*height;
size_t outPlaneSize = outW*outH;
bool is1x1 = is1x1_;
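// choose the parallelization granularity: if there are enough stripes, each
// sample's output plane is split across several stripes; otherwise work is
// distributed whole-plane-at-a-time (stripeSize == outPlaneSize), possibly
// several samples per worker.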
int stripesPerSample;
size_t stripeSize;
Range r = r0;
if( nstripes >= batchSize*2 )
{
stripesPerSample = nstripes/batchSize;
stripeSize = alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
stripeSize = std::min(stripeSize, outPlaneSize);
}
else
{
stripesPerSample = 1;
int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1);
r.start *= samplesPerStripe;
r.end *= samplesPerStripe;
nstripes *= samplesPerStripe;
stripeSize = outPlaneSize;
}
for (int n = 0; n < numImg; n++)
const float* data_inp0_ = input_->ptr<float>();
const int* ofstab = &ofstab_[0];
const float* wptr_orig_ = weights_->ptr<float>();
size_t wstep = weights_->step1();
const float* biasvec = &biasvec_[0];
float* data_out0_ = output_->ptr<float>();
size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
const int valignBytes = (int)(valign*sizeof(float));
AutoBuffer<float> rowbuf0_(rowbufsz + valignBytes);
float* rowbuf0 = alignPtr((float*)rowbuf0_, valignBytes);
// we clear the buffer once; ultimately, it lets us avoid
// tail processing after running the unrolled/vectorized loop.
// the main idea is to make sure that the tail (a.k.a. padding) of each row
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
// does not contain NaNs or Infs. Because the padding in the weights
// matrix is explicitly initialized with 0's, we handle all other
// cases nicely, i.e. we can skip explicit re-initialization
// of the padding - we just retain elements from the previous iteration
// of the loop over channels (cn0).
memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
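// rowbuf0 layout: up to BLK_SIZE rows (one per output location of the current
// block), each vsz_a floats wide; elements [0, vsz) of a row hold the gathered
// input patch, while [vsz, vsz_a) is the zero alignment tail described above.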
for( int stripe = r.start; stripe < r.end; stripe++ )
{
for (int g = 0; g < group; g++)
int subsampleIdx = stripe/stripesPerSample;
if( subsampleIdx >= batchSize )
break;
int stripeStart = (int)((stripe - subsampleIdx*stripesPerSample)*stripeSize);
int stripeEnd = (int)std::min(stripeStart + stripeSize, outPlaneSize);
const float* data_inp0 = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn;
float* data_out0 = data_out0_ + subsampleIdx*outPlaneSize*outCn;
int startOutCn = (subsampleIdx % ngroups)*outCn;
const float* wptr_orig = wptr_orig_ + wstep*startOutCn;
const float* biasptr = biasvec + startOutCn;
for( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN )
{
Mat curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
im2row(curInp, internals[0], shape(inpMat), shape(outputs[ii]));
_Range kerRange(g * outGroupCn, outGroupCn);
Mat kerMat = weightsMat.rowRange(kerRange);
_Range outRange((g + n * group) * outGroupCn, outGroupCn);
Mat dstMat = outMat.rowRange(outRange);
int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn);
int ncn = cn1 - cn0, vsz = karea*ncn;
int vsz_a = (int)alignSize(vsz, valign);
const float* wptr = wptr_orig + cn0*karea;
dnn::gemm(kerMat, internals[0], 1, dstMat, 0, GEMM_2_T);
if (hasBias())
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
{
dnn::gemm(biasesMat.rowRange(kerRange), internals[1], 1, dstMat, 1);
int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
// do im2row for a part of input tensor
if( is1x1 )
{
for( ofs = ofs0; ofs < ofs1; ofs++ )
{
int out_i = ofs / outW;
int out_j = ofs - out_i * outW;
float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
int in_i = out_i * stride_h - pad_h;
int in_j = out_j * stride_w - pad_w;
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
for( k = 0; k < vsz; k++ )
rowbuf[k] = imgptr[k*inpPlaneSize];
}
}
else
{
for( ofs = ofs0; ofs < ofs1; ofs++ )
{
int out_i = ofs / outW;
int out_j = ofs - out_i * outW;
float* rowbuf = rowbuf0 + (ofs - ofs0)*vsz_a;
int in_i = out_i * stride_h - pad_h;
int in_j = out_j * stride_w - pad_w;
const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
// this condition should be true for most of the tensor elements, i.e.
// most of the time the kernel aperture is inside the tensor X-Y plane.
if( 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h &&
0 <= in_j && in_j < width - (kernel_w-1)*dilation_w )
{
for( k = 0; k < vsz; k++ )
rowbuf[k] = imgptr[ofstab[k]];
}
else
{
int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
// here some non-contiguous sub-row of the row will not be
// filled from the tensor; we need to make sure that the uncovered
// elements are explicitly set to 0's. the easiest way is to
// set all the elements to 0's before the loop.
memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
for( k = 0; k < ncn; k++, imgptr += width*height )
{
for( i = i0; i < i1; i++ )
{
for( j = j0; j < j1; j++ )
{
int imgofs = i*(dilation_h*width) + j*dilation_w;
rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
}
}
}
}
}
}
// now compute dot product of the weights
// and im2row-transformed part of the tensor
int bsz = ofs1 - ofs0;
#if CV_DNN_TRY_AVX2
if(useAVX2)
fastConv_avx2(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, cn0 == 0);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{
const float* wptr0 = wptr + i*wstep;
const float* wptr1 = wptr0 + wstep;
float* outptr0 = data_out0 + ofs0 + i*outPlaneSize;
float* outptr1 = outptr0 + outPlaneSize;
float bias0 = biasptr[i], bias1 = biasptr[i+1];
if( i+1 >= outCn )
{
wptr1 = wptr0;
outptr1 = outptr0;
bias1 = bias0;
}
int j = 0;
#if CV_SIMD128
for( ; j <= bsz - 4; j += 4 )
{
const float* rptr = rowbuf0 + j*vsz_a;
v_float32x4 s0, s1;
if( cn0 == 0 )
{
s0 = v_setall_f32(bias0);
s1 = v_setall_f32(bias1);
}
else
{
s0 = v_load(outptr0 + j);
s1 = v_load(outptr1 + j);
}
v_float32x4 vs00 = v_setzero_f32(), vs01 = v_setzero_f32(),
vs02 = v_setzero_f32(), vs03 = v_setzero_f32(),
vs10 = v_setzero_f32(), vs11 = v_setzero_f32(),
vs12 = v_setzero_f32(), vs13 = v_setzero_f32();
for( k = 0; k < vsz; k += 4, rptr += 4 )
{
v_float32x4 w0 = v_load_aligned(wptr0 + k), w1 = v_load_aligned(wptr1 + k);
v_float32x4 r0 = v_load_aligned(rptr), r1 = v_load_aligned(rptr + vsz_a),
r2 = v_load_aligned(rptr + vsz_a*2), r3 = v_load_aligned(rptr + vsz_a*3);
vs00 += w0*r0;
vs01 += w0*r1;
vs02 += w0*r2;
vs03 += w0*r3;
vs10 += w1*r0;
vs11 += w1*r1;
vs12 += w1*r2;
vs13 += w1*r3;
}
s0 += v_reduce_sum4(vs00, vs01, vs02, vs03);
s1 += v_reduce_sum4(vs10, vs11, vs12, vs13);
v_store(outptr0 + j, s0);
v_store(outptr1 + j, s1);
}
#endif
for( ; j < bsz; j++ )
{
const float* rptr = rowbuf0 + j*vsz_a;
float s00, s10;
if( cn0 == 0 )
{
s00 = bias0;
s10 = bias1;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
}
for( k = 0; k < vsz; k++ )
{
float r0 = rptr[k];
s00 += wptr0[k]*r0;
s10 += wptr1[k]*r0;
}
outptr0[j] = s00;
outptr1[j] = s10;
}
}
}
}
if( activ_ )
activ_->forwardSlice(data_out0 + stripeStart, data_out0 + stripeStart,
(int)(stripeEnd - stripeStart),
outPlaneSize, startOutCn, startOutCn + outCn);
}
}
}
};
void im2row(const Mat &srcImg, Mat &dstRow, const MatShape& inShape, const MatShape& outShape)
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
int inpH = inShape[2];
int inpW = inShape[3];
int outH = outShape[2], outW = outShape[3];
int inpGroupCn = blobs[0].size[1];
int ksize = inpGroupCn * kernel.height * kernel.width;
CV_Assert(inputs.size() == (size_t)1 && inputs[0]->size[1] % blobs[0].size[1] == 0);
int ngroups = inputs[0]->size[1]/blobs[0].size[1];
CV_Assert(outputs[0].size[1] % ngroups == 0);
if (is1x1())
{
transpose(srcImg.reshape(1, ksize), dstRow);
}
else
int outCn = blobs[0].size[0];
if( weightsMat.empty() )
{
cv::dnn::im2row(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
kernel.width, pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, outH, outW, dstRow.ptr<float>());
Mat wm = blobs[0].reshape(1, outCn);
if( wm.step1() % VEC_ALIGN != 0 )
{
int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
Mat wm_buffer = Mat(outCn, newcols, wm.type());
Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
wm_padding.setTo(Scalar::all(0.));
Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
wm.copyTo(wm_aligned);
wm = wm_aligned;
}
weightsMat = wm;
}
Mat biasesMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
int nstripes = std::max(getNumThreads(), 1);
ParallelConv::run(*inputs[0], outputs[0], weightsMat, biasesMat,
kernel, pad, stride, dilation, ngroups, nstripes, activ.get());
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
......@@ -249,8 +540,8 @@ public:
int inpH = inpShape[2];
int inpW = inpShape[3];
int outCn = outShape[1];
int group = inpCn / blobs[0].size[1];
int outGroupCn = outCn / group;
int ngroups = inpCn / blobs[0].size[1];
int outGroupCn = outCn / ngroups;
int ksize = outGroupCn * kernel.height * kernel.width;
return shape(ksize, inpH * inpW);
}
......@@ -271,10 +562,10 @@ public:
int outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
int outCn = blobs[0].size[0];
int group = inpCn / blobs[0].size[1];
int ngroups = inpCn / blobs[0].size[1];
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
CV_Assert(inpCn % ngroups == 0 && outCn % ngroups == 0);
CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / ngroups);
int dims[] = {inputs[0][0], outCn, outH, outW};
outputs.resize(inputs.size(), shape(dims));
......@@ -303,9 +594,9 @@ public:
for (size_t ii = 0; ii < outputs.size(); ii++)
{
int group = inpCn / blobs[0].size[1];
int ngroups = inpCn / blobs[0].size[1];
int inpGroupCn = blobs[0].size[1];
int outGroupCn = outCn / group;
int outGroupCn = outCn / ngroups;
int numImg = inputs[ii]->size[0];
Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
......@@ -313,12 +604,12 @@ public:
for (int n = 0; n < numImg; n++)
{
for (int g = 0; g < group; g++)
for (int g = 0; g < ngroups; g++)
{
Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
Mat dstMat = decnBlob.rowRange(_Range((g + n * ngroups) * outGroupCn, outGroupCn));
Mat &colMat = (is1x1()) ? dstMat : internals[0];
Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
Mat convMat = convBlob.rowRange(_Range((g + n * ngroups) * inpGroupCn, inpGroupCn));
Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
......@@ -340,8 +631,8 @@ public:
{
int outCn = outShape[1], outH = outShape[2], outW = outShape[3];
int inpCn = inShape[1];
int group = inpCn / blobs[0].size[1];
int outGroupCn = outCn / group;
int ngroups = inpCn / blobs[0].size[1];
int outGroupCn = outCn / ngroups;
if (is1x1())
{
......@@ -382,12 +673,12 @@ static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, const Laye
bool bias = params.get<bool>("bias_term", true);
int numOutput = params.get<int>("num_output");
int group = params.get<int>("group", 1);
int ngroups = params.get<int>("group", 1);
l->adjustPad.height = params.get<int>("adj_h", 0);
l->adjustPad.width = params.get<int>("adj_w", 0);
CV_Assert(numOutput % group == 0);
CV_Assert(numOutput % ngroups == 0);
CV_Assert((bias && l->blobs.size() == 2) || (!bias && l->blobs.size() == 1));
}
......
......@@ -16,25 +16,53 @@ template<typename Func>
class ElementWiseLayer : public Func::Layer
{
public:
template<typename Dtype>
class PBody : public cv::ParallelLoopBody
{
Func &func;
Dtype *src, *dst;
public:
const Func* func_;
const Mat* src_;
Mat* dst_;
int nstripes_;
PBody(Mat &src, Mat &dst, Func &func_) :
func(func_), src(src.ptr<Dtype>()), dst(dst.ptr<Dtype>())
{}
PBody(const Func &func, const Mat &src, Mat& dst, int nstripes)
{
func_ = &func;
src_ = &src;
dst_ = &dst;
nstripes_ = nstripes;
}
void operator()(const Range &r) const
{
for (int i = r.start; i < r.end; i++)
dst[i] = func(src[i]);
int nstripes = nstripes_, nsamples, outCn;
size_t planeSize;
if( src_->dims == 4 )
{
nsamples = src_->size[0];
outCn = src_->size[1];
planeSize = (size_t)src_->size[2]*src_->size[3];
}
else
{
nsamples = outCn = 1;
planeSize = (size_t)src_->total();
}
size_t stripeSize = (planeSize + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, planeSize);
for( int i = 0; i < nsamples; i++ )
{
const float* srcptr = src_->ptr<float>(i) + stripeStart;
float* dstptr = dst_->ptr<float>(i) + stripeStart;
func_->apply(srcptr, dstptr, (int)(stripeEnd - stripeStart), planeSize, 0, outCn);
}
}
};
ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}
ElementWiseLayer(const Func &f=Func()) { func = f; }
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
......@@ -49,20 +77,22 @@ public:
{
for (size_t i = 0; i < inputs.size(); i++)
{
Mat &src = *inputs[i];
const Mat &src = *inputs[i];
Mat &dst = outputs[i];
CV_Assert(src.isContinuous() && dst.isContinuous());
CV_Assert(src.size == dst.size && src.type() == dst.type() &&
src.isContinuous() && dst.isContinuous() && src.type() == CV_32F);
Range sizeRange = Range(0, dst.total());
CV_Assert(src.type() == CV_32F);
PBody<float> body(src, dst, func);
if( run_parallel )
cv::parallel_for_(sizeRange, body);
else
body(sizeRange);
const int nstripes = getNumThreads();
PBody body(func, src, dst, nstripes);
parallel_for_(Range(0, nstripes), body, nstripes);
}
}
void forwardSlice(const float* src, float* dst, int len, size_t planeSize, int cn0, int cn1) const
{
func.apply(src, dst, len, planeSize, cn0, cn1);
}
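// forwardSlice() exposes the same functor to other layers; this is what lets
// the convolution layer fuse the activation via setActivation() and apply it
// to each stripe of its own output.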
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const
{
......@@ -83,172 +113,208 @@ struct ReLUFunctor
typedef ReLULayer Layer;
float slope;
ReLUFunctor(float slope_) : slope(slope_) {}
explicit ReLUFunctor(float slope_=1.f) : slope(slope_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return (x >= (TFloat)0) ? x : (TFloat)slope * x;
float s = slope;
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
int i = 0;
#if CV_SIMD128
v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32();
for( ; i <= len - 16; i += 16 )
{
v_float32x4 x0 = v_load(srcptr + i);
v_float32x4 x1 = v_load(srcptr + i + 4);
v_float32x4 x2 = v_load(srcptr + i + 8);
v_float32x4 x3 = v_load(srcptr + i + 12);
x0 = v_select(x0 >= z, x0, x0*s4);
x1 = v_select(x1 >= z, x1, x1*s4);
x2 = v_select(x2 >= z, x2, x2*s4);
x3 = v_select(x3 >= z, x3, x3*s4);
v_store(dstptr + i, x0);
v_store(dstptr + i + 4, x1);
v_store(dstptr + i + 8, x2);
v_store(dstptr + i + 12, x3);
}
#endif
for( ; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = x >= 0.f ? x : s*x;
}
}
}
int64 getFLOPSPerElement() const {return 1;}
int64 getFLOPSPerElement() const { return 1; }
};
struct TanHFunctor
{
typedef TanHLayer Layer;
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return tanh(x);
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = tanh(x);
}
}
}
int64 getFLOPSPerElement() const {return 1;}
int64 getFLOPSPerElement() const { return 1; }
};
struct SigmoidFunctor
{
typedef SigmoidLayer Layer;
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return (TFloat)1 / ((TFloat)1 + exp(-x));
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = 1.f/(1.f + exp(-x));
}
}
}
int64 getFLOPSPerElement() const {return 3;}
int64 getFLOPSPerElement() const { return 3; }
};
struct AbsValFunctor
{
typedef AbsLayer Layer;
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return abs(x);
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = abs(x);
}
}
}
int64 getFLOPSPerElement() const {return 1;}
int64 getFLOPSPerElement() const { return 1; }
};
struct BNLLFunctor
{
typedef BNLLLayer Layer;
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return log((TFloat)1 + exp(-abs(x)));
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = log(1.f + exp(-abs(x)));
}
}
}
int64 getFLOPSPerElement() const {return 5;}
int64 getFLOPSPerElement() const { return 5; }
};
struct PowerFunctor
{
typedef PowerLayer Layer;
const float power;
const float scale;
const float shift;
float power;
float scale;
float shift;
PowerFunctor(float power_, float scale_ = 1.f, float shift_ = 0)
explicit PowerFunctor(float power_ = 1.f, float scale_ = 1.f, float shift_ = 0.f)
: power(power_), scale(scale_), shift(shift_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
float a = scale, b = shift, p = power;
if( p == 1.f )
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = a*x + b;
}
}
}
else
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
for( int i = 0; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = pow(a*x + b, p);
}
}
}
}
int64 getFLOPSPerElement() const {return 3;}
int64 getFLOPSPerElement() const { return power == 1 ? 2 : 10; }
};
struct PowerFunctor1
{
typedef PowerLayer Layer;
const float scale;
const float shift;
PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
: scale(scale_), shift(shift_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return (TFloat)shift + (TFloat)scale * x;
}
int64 getFLOPSPerElement() const {return 2;}
};
class ChannelsPReLULayerImpl : public ChannelsPReLULayer
struct ChannelsPReLUFunctor
{
public:
ChannelsPReLULayerImpl(const LayerParams& params)
{
CV_Assert(params.blobs.size() == 1);
setParamsFrom(params);
}
typedef ChannelsPReLULayer Layer;
Mat scale;
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const
explicit ChannelsPReLUFunctor(const Mat& scale_=Mat()) : scale(scale_)
{
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return true;
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
{
CV_Assert(inputs.size() == 1);
Mat &inpBlob = *inputs[0];
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Mat &outBlob = outputs[ii];
CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());
CV_Assert(scale.isContinuous() && scale.type() == CV_32F);
CV_Assert(blobs[0].total() == inpBlob.size[1]);
const float* scaleptr = scale.ptr<float>();
CV_Assert( 0 <= cn0 && cn0 < cn1 && cn1 <= (int)scale.total() );
for (int n = 0; n < inpBlob.size[1]; n++)
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
float s = scaleptr[cn];
int i = 0;
#if CV_SIMD128
v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32();
for( ; i <= len - 16; i += 16 )
{
float slopeWeight = blobs[0].at<float>(n);
Mat inpBlobPlane = getPlane(inpBlob, 0, n);
Mat outBlobPlane = getPlane(outBlob, 0, n);
size_t i, planeTotal = inpBlobPlane.total();
const float* inptr = inpBlobPlane.ptr<float>();
float* outptr = outBlobPlane.ptr<float>();
for( i = 0; i < planeTotal; i++ )
{
float val = inptr[i];
outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
}
//threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
//scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
v_float32x4 x0 = v_load(srcptr + i);
v_float32x4 x1 = v_load(srcptr + i + 4);
v_float32x4 x2 = v_load(srcptr + i + 8);
v_float32x4 x3 = v_load(srcptr + i + 12);
x0 = v_select(x0 >= z, x0, x0*s4);
x1 = v_select(x1 >= z, x1, x1*s4);
x2 = v_select(x2 >= z, x2, x2*s4);
x3 = v_select(x3 >= z, x3, x3*s4);
v_store(dstptr + i, x0);
v_store(dstptr + i + 4, x1);
v_store(dstptr + i + 8, x2);
v_store(dstptr + i + 12, x3);
}
#endif
for( ; i < len; i++ )
{
float x = srcptr[i];
dstptr[i] = x >= 0.f ? x : s*x;
}
}
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const
{
(void)inputs; // suppress unused variable warning
long flops = 0;
for (int i = 0; i < outputs.size(); i++)
{
flops += total(outputs[i]) * 3;
}
return flops;
}
int64 getFLOPSPerElement() const { return 1; }
};
#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \
......@@ -259,7 +325,7 @@ Ptr<_Layer> _Layer::create() { \
Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
{
float negativeSlope = params.get<float>("negative_slope", 0.f);
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(true, ReLUFunctor(negativeSlope)));
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
l->setParamsFrom(params);
l->negativeSlope = negativeSlope;
......@@ -268,7 +334,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
{
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
l->setParamsFrom(params);
return l;
......@@ -276,7 +342,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
{
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
l->setParamsFrom(params);
return l;
......@@ -292,7 +358,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
{
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
l->setParamsFrom(params);
return l;
......@@ -303,9 +369,7 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
float power = params.get<float>("power", 1.0f);
float scale = params.get<float>("scale", 1.0f);
float shift = params.get<float>("shift", 0.0f);
Ptr<PowerLayer> l(power == 1.f ?
(PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
(PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
l->setParamsFrom(params);
l->power = power;
l->scale = scale;
......@@ -314,10 +378,12 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
return l;
}
Ptr<ChannelsPReLULayer> ChannelsPReLULayer::create(const LayerParams& params)
{
return Ptr<ChannelsPReLULayer>(new ChannelsPReLULayerImpl(params));
Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0])));
l->setParamsFrom(params);
return l;
}
}
......
......@@ -52,6 +52,8 @@ namespace dnn
class FullyConnectedLayerImpl : public InnerProductLayer
{
public:
enum { VEC_ALIGN = 8 };
FullyConnectedLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
......@@ -65,15 +67,29 @@ public:
CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));
blobs[0] = blobs[0].reshape(1, numOutput);
weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
int vecsize = weightsMat.cols;
if( vecsize % VEC_ALIGN != 0 )
{
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
wpadding.setTo(Scalar::all(0.));
weightsMat = weightsBuf.colRange(0, vecsize);
blobs[0].copyTo(weightsMat);
blobs[0] = weightsMat;
}
if (bias)
blobs[1] = blobs[1].reshape(1, 1);
biasMat = blobs[1] = blobs[1].reshape(1, 1);
else
biasMat = Mat::zeros(1, numOutput, weightsMat.type());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const
std::vector<MatShape> &) const
{
CV_Assert(inputs.size() > 0);
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
......@@ -84,36 +100,116 @@ public:
int numOutput = blobs[0].size[0];
outputs.resize(inputs.size(), shape(outerSize, numOutput));
internals.push_back(shape(outerSize, 1));
CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
return false;
}
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
class FullConnected : public ParallelLoopBody
{
internals[0].setTo(1.);
const Mat &weight = blobs[0];
const Mat *biasMat = NULL, *biasOnesMat = NULL;
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
public:
FullConnected(const Mat& srcMat, const Mat& weights, const Mat& biasMat, Mat& dstMat, int nstripes)
{
CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
srcMat.type() == CV_32F &&
(biasMat.empty() || (biasMat.type() == srcMat.type() &&
biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );
srcMat_ = &srcMat;
weights_ = &weights;
biasMat_ = &biasMat;
dstMat_ = &dstMat;
nstripes_ = nstripes;
useAVX2_ = checkHardwareSupport(CPU_AVX2);
}
if (bias)
void operator()(const Range& r) const
{
biasOnesMat = &internals[0];
biasMat = &blobs[1];
int nsamples = srcMat_->rows;
int nw0 = weights_->rows;
int vecsize = srcMat_->cols;
int nstripes = nstripes_;
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights_->step1();
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr = srcMat_->ptr<float>(sampleIdx);
const float* wptr = weights_->ptr<float>(delta);
float* dptr = dstMat_->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat_->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
#if CV_DNN_TRY_AVX2
if( useAVX2_ )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0, k;
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
for( k = 0; k < vecsize; k += 4 )
{
v_float32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
vs3 += v*v_load_aligned(wptr + wstep*3 + k);
}
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
s += v_load(biasptr + i);
v_store(dptr + i, s);
}
#endif
for( ; i < nw; i++, wptr += wstep )
{
float s0=biasptr[i];
for( k = 0; k < vecsize; k++ )
{
float v = sptr[k];
s0 += v*wptr[k];
}
dptr[i] = s0;
}
}
ofs += nw;
}
}
const Mat *srcMat_, *weights_, *biasMat_;
Mat* dstMat_;
int nstripes_;
bool useAVX2_;
};
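// Semantically, FullConnected computes dstMat = srcMat * weights^T + biasMat
// (the same result as the previous gemm-based path), with the flattened
// (sample, output neuron) index range split evenly into nstripes stripes.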
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &)
{
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
for (size_t i = 0; i < input.size(); i++)
{
Mat srcMat = input[i]->reshape(1, outerSize);
Mat dstMat = output[i].reshape(1, outerSize);
dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T);
if (bias)
dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1);
const int nstripes = getNumThreads();
FullConnected fconn(srcMat, weightsMat, biasMat, dstMat, nstripes);
parallel_for_(Range(0, nstripes), fconn, nstripes);
}
}
......@@ -134,6 +230,7 @@ public:
}
bool bias;
Mat weightsMat, biasMat;
};
Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#if CV_DNN_TRY_AVX2
#include <immintrin.h>
namespace cv {
namespace dnn {
void fastConv_avx2( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, bool initOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
// now compute dot product of the weights
// and im2row-transformed part of the tensor
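// the block below processes 3 output channels x 4 output positions at a time,
// accumulating dot products with FMA and reducing them with horizontal adds.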
for( int i = 0; i < outCn; i += 3 )
{
const float* wptr0 = weights + i*wstep;
const float* wptr1 = wptr0 + wstep;
const float* wptr2 = wptr1 + wstep;
float* outptr0 = output + i*outPlaneSize;
float* outptr1 = outptr0 + outPlaneSize;
float* outptr2 = outptr1 + outPlaneSize;
float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
}
}
int j = 0;
for( ; j <= blockSize - 4; j += 4 )
{
const float* rptr = rowbuf + j*vecsize_aligned;
__m256 vs00 = _mm256_setzero_ps(), vs01 = _mm256_setzero_ps(),
vs02 = _mm256_setzero_ps(), vs03 = _mm256_setzero_ps(),
vs10 = _mm256_setzero_ps(), vs11 = _mm256_setzero_ps(),
vs12 = _mm256_setzero_ps(), vs13 = _mm256_setzero_ps(),
vs20 = _mm256_setzero_ps(), vs21 = _mm256_setzero_ps(),
vs22 = _mm256_setzero_ps(), vs23 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, rptr += 8 )
{
__m256 w0 = _mm256_load_ps(wptr0 + k);
__m256 w1 = _mm256_load_ps(wptr1 + k);
__m256 w2 = _mm256_load_ps(wptr2 + k);
__m256 r0 = _mm256_load_ps(rptr);
vs00 = _mm256_fmadd_ps(w0, r0, vs00);
vs10 = _mm256_fmadd_ps(w1, r0, vs10);
vs20 = _mm256_fmadd_ps(w2, r0, vs20);
r0 = _mm256_load_ps(rptr + vecsize_aligned);
vs01 = _mm256_fmadd_ps(w0, r0, vs01);
vs11 = _mm256_fmadd_ps(w1, r0, vs11);
vs21 = _mm256_fmadd_ps(w2, r0, vs21);
r0 = _mm256_load_ps(rptr + vecsize_aligned*2);
vs02 = _mm256_fmadd_ps(w0, r0, vs02);
vs12 = _mm256_fmadd_ps(w1, r0, vs12);
vs22 = _mm256_fmadd_ps(w2, r0, vs22);
r0 = _mm256_load_ps(rptr + vecsize_aligned*3);
vs03 = _mm256_fmadd_ps(w0, r0, vs03);
vs13 = _mm256_fmadd_ps(w1, r0, vs13);
vs23 = _mm256_fmadd_ps(w2, r0, vs23);
}
__m256 t0 = _mm256_hadd_ps(_mm256_hadd_ps(vs00, vs01), _mm256_hadd_ps(vs02, vs03));
__m256 t1 = _mm256_hadd_ps(_mm256_hadd_ps(vs10, vs11), _mm256_hadd_ps(vs12, vs13));
__m256 t2 = _mm256_hadd_ps(_mm256_hadd_ps(vs20, vs21), _mm256_hadd_ps(vs22, vs23));
t0 = _mm256_add_ps(t0, _mm256_permute2f128_ps(t0, t0, 1));
t1 = _mm256_add_ps(t1, _mm256_permute2f128_ps(t1, t1, 1));
t2 = _mm256_add_ps(t2, _mm256_permute2f128_ps(t2, t2, 1));
__m256 s0, s1, s2;
if( initOutput )
{
s0 = _mm256_set1_ps(bias0);
s1 = _mm256_set1_ps(bias1);
s2 = _mm256_set1_ps(bias2);
}
else
{
s0 = _mm256_castps128_ps256(_mm_loadu_ps(outptr0 + j));
s1 = _mm256_castps128_ps256(_mm_loadu_ps(outptr1 + j));
s2 = _mm256_castps128_ps256(_mm_loadu_ps(outptr2 + j));
}
s0 = _mm256_add_ps(s0, t0);
s1 = _mm256_add_ps(s1, t1);
s2 = _mm256_add_ps(s2, t2);
_mm_storeu_ps(outptr0 + j, _mm256_castps256_ps128(s0));
_mm_storeu_ps(outptr1 + j, _mm256_castps256_ps128(s1));
_mm_storeu_ps(outptr2 + j, _mm256_castps256_ps128(s2));
}
for( ; j < blockSize; j++ )
{
const float* rptr = rowbuf + j*vecsize_aligned;
float s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
float r0 = rptr[k];
s00 += wptr0[k]*r0;
s10 += wptr1[k]*r0;
s20 += wptr2[k]*r0;
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
_mm256_zeroupper();
}
// dst = vec * weights^t + bias
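// processes 8 outputs (rows of weights) per iteration of the outer loop; the
// inner k-loop steps by 8 and relies on the zero padding the caller adds to the
// weights rows (see VEC_ALIGN in the fully connected layer), so any extra lanes
// beyond vecsize contribute nothing to the sums.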
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
int i = 0;
for( ; i <= nvecs - 8; i += 8 )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = _mm256_setzero_ps(), vs1 = _mm256_setzero_ps(),
vs2 = _mm256_setzero_ps(), vs3 = _mm256_setzero_ps(),
vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(),
vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2);
vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3);
vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4);
vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5);
vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6);
vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7);
}
__m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3));
__m256 s1 = _mm256_hadd_ps(_mm256_hadd_ps(vs4, vs5), _mm256_hadd_ps(vs6, vs7));
s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
s1 = _mm256_add_ps(s1, _mm256_permute2f128_ps(s1, s1, 1));
s0 = _mm256_add_ps(s0, _mm256_castps128_ps256(_mm_loadu_ps(bias + i)));
s1 = _mm256_add_ps(s1, _mm256_castps128_ps256(_mm_loadu_ps(bias + i + 4)));
_mm_storeu_ps(dst + i, _mm256_castps256_ps128(s0));
_mm_storeu_ps(dst + i + 4, _mm256_castps256_ps128(s1));
}
float temp = 0.f;
for( ; i < nvecs; i++ )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
}
__m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0);
s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
_mm_store_ss(&temp, _mm256_castps256_ps128(s0));
dst[i] = temp + bias[i];
}
_mm256_zeroupper();
}
}
}
#endif
......@@ -64,6 +64,21 @@ void getConvPoolOutParams(const Size& inp, const Size &kernel,
void getConvPoolPaddings(const Size& inp, const Size& out,
const Size &kernel, const Size &stride,
const String &padMode, Size &pad);
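// AVX2 kernels are compiled whenever the SSE2 baseline is enabled; whether they
// are actually used is decided at run time via checkHardwareSupport(CPU_AVX2).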
#if CV_SSE2
#define CV_DNN_TRY_AVX2 1
void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, bool initOutput);
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
#else
#define CV_DNN_TRY_AVX2 0
#endif
}
}
......
......@@ -41,8 +41,9 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include "opencv2/imgproc.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/hal/hal.hpp"
#include <algorithm>
namespace cv
......@@ -100,45 +101,94 @@ public:
}
}
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
class ChannelLRN : public ParallelLoopBody
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
Mat srcMat = srcBlob.clone();
Mat dstMat = dstBlob;
public:
ChannelLRN(const float* src, float* dst, int channels, int ksize,
float alpha1, float bias1, float beta1,
size_t planeSize, int nsamples, int nstripes)
{
src_ = src; dst_ = dst;
channels_ = channels;
ksize_ = ksize;
alpha1_ = alpha1; bias1_ = bias1; beta1_ = beta1;
planeSize_ = planeSize; nsamples_ = nsamples; nstripes_ = nstripes;
}
for (int n = 0; n < num; n++)
void operator()(const Range& r) const
{
Mat accum = getPlane(dstMat, n, channels-1); //trick for memory saving
accum.setTo(0);
int nsamples = nsamples_, nstripes = nstripes_;
size_t planeSize = planeSize_, planeSize_n = planeSize * nsamples;
size_t elemsPerStripe = (planeSize_n + nstripes - 1)/nstripes;
size_t rstart = r.start*elemsPerStripe;
size_t rend = r.end == nstripes ? planeSize_n : r.end*elemsPerStripe;
rstart = std::min(rstart, planeSize_n);
rend = std::min(rend, planeSize_n);
float alpha1 = alpha1_, bias1 = bias1_, beta1 = beta1_;
int k, channels = channels_, ksize = ksize_;
for (int cn = 0; cn < std::min(ksize, channels); cn++)
cv::accumulateSquare(getPlane(srcMat, n, cn), accum);
AutoBuffer<float> buf_((channels + ksize*2 + 4)*2);
float* acc = (float*)buf_;
float* buf = acc + channels + ksize + 1;
for( k = 0; k <= ksize; k++ )
buf[-k-1] = buf[channels + k] = 0.f;
for (int cn = 0; cn < channels; cn++)
for( size_t ofs = rstart; ofs < rend; )
{
if (cn + ksize < channels)
{
cv::accumulateSquare(getPlane(srcMat, n, cn + ksize), accum);
}
int sampleIdx = (int)(ofs/planeSize);
if( sampleIdx >= nsamples )
break;
size_t ofs0 = ofs - sampleIdx*planeSize;
size_t ofs1 = std::min(planeSize - ofs0, rend - ofs) + ofs;
const float* src = src_ + sampleIdx*planeSize*channels + ofs0;
float* dst = dst_ + sampleIdx*planeSize*channels + ofs0;
if (cn - ksize - 1 >= 0)
for( ; ofs < ofs1; ofs++, src++, dst++ )
{
//subtractSquare
Mat left = getPlane(srcMat, n, cn - ksize - 1);
cv::pow(left, 2, left);
cv::subtract(accum, left, accum);
}
for( k = 0; k < channels; k++ )
buf[k] = src[k*planeSize];
float s = 0;
for( k = 0; k < ksize; k++ )
s += buf[k]*buf[k];
for( k = 0; k < channels; k++ )
{
float x1 = buf[k + ksize];
float x0 = buf[k - ksize - 1];
s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
acc[k] = (float)(alpha1*s + bias1);
}
Mat dst = getPlane(dstMat, n, cn);
accum.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
cv::pow(dst, beta, dst);
cv::divide(getPlane(srcMat, n, cn), dst, dst);
hal::log32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
acc[k] *= beta1;
hal::exp32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
dst[k*planeSize] = buf[k]*acc[k];
}
}
}
const float* src_;
float* dst_;
float alpha1_, bias1_, beta1_;
size_t planeSize_;
int channels_, ksize_, nsamples_, nstripes_;
};
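// For reference, with alpha1 = alpha/sizeNormFactor, bias1 = bias and beta1 = -beta
// (as passed by channelNormalization below), ChannelLRN computes for every spatial
// position and channel c:
//     dst[c] = src[c] * (bias + alpha/sizeNormFactor * sum_{|c'-c| <= ksize} src[c']^2)^(-beta)
// The sliding window sum s is updated incrementally as s += x1^2 - x0^2, and the
// power is evaluated for a whole row of channels at once via hal::log32f/hal::exp32f.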
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
size_t planeSize = srcBlob.size[2]*srcBlob.size[3];
int nstripes = std::max(getNumThreads(), 1);
ChannelLRN clrn(srcBlob.ptr<float>(), dstBlob.ptr<float>(), channels,
ksize, alpha/sizeNormFactor, bias, -beta, planeSize, num, nstripes);
parallel_for_(Range(0, nstripes), clrn, nstripes);
}
void sqrBoxFilter_(const Mat &src, Mat &dst)
......
......@@ -48,194 +48,6 @@
namespace cv {
namespace dnn {
#if 0
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
#endif
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
int kh = kernel_h, kw = kernel_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
int kelems = kh*kw;
AutoBuffer<int> ofs_(kelems);
int* ofs = ofs_;
int k = 0;
for( int k_r = 0; k_r < kernel_h; k_r++ )
for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
ofs[k] = k_r*dh*width + k_c*dw;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kh*kw*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
{
for( int i_c = 0; i_c < channels; i_c++ )
{
float* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
const float* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
for( k = 0; k < kelems; k++ )
{
data_col_c[k] = data_im_c[ofs[k]];
}
}
}
else
{
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kh*kw;
int in_r = start_in_r + start_k_r*dh;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kw;
int in_c = start_in_c + start_k_c*dw;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
}
};
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col)
{
im2row_CpuPBody<float>::run(data_im, channels, height, width,
kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
height_col, width_col, data_col);
}
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
......