Commit 645260af authored by Vadim Pisarevsky, committed by GitHub

optimized several conv net layers (#1227)

* rewrote the following layers to be [much] more efficient: convolution, fully connected, activations (ReLU, tanh, ...), and LRN. Optional AVX optimization is used for the first two.

* eliminated trailing whitespace
parent 009d2efb
@@ -201,9 +201,13 @@ namespace dnn
String padMode;
};
class CV_EXPORTS ActivationLayer;
class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
{
public:
virtual bool setActivation(const Ptr<ActivationLayer>& layer) = 0;
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
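The new setActivation hook lets callers attach the activation that follows a convolution, so the convolution can apply it inside its own parallel loop (via ActivationLayer::forwardSlice, declared below). A hedged usage sketch with placeholder LayerParams (convParams and reluParams are illustrative, not from this patch):

    LayerParams convParams, reluParams;               // placeholders, contents omitted
    Ptr<BaseConvolutionLayer> conv = ConvolutionLayer::create(convParams);
    Ptr<ActivationLayer> act = ReLULayer::create(reluParams);
    conv.dynamicCast<ConvolutionLayer>()->setActivation(act); // fuse the ReLU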
@@ -327,8 +331,14 @@ namespace dnn
};
/* Activations */
class CV_EXPORTS ActivationLayer : public Layer
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const = 0;
};
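forwardSlice processes channels [cn0, cn1) of one sample, len elements per channel, with consecutive channel planes outPlaneSize floats apart; this is what lets a fused convolution apply the activation stripe by stripe. A minimal ReLU-style sketch of that contract (the actual implementations live in elementwise_layers.cpp, outside this excerpt):

    #include <cstddef>

    static void reluForwardSliceSketch(const float* src, float* dst, int len,
                                       size_t outPlaneSize, int cn0, int cn1,
                                       float negativeSlope)
    {
        for( int cn = cn0; cn < cn1; cn++, src += outPlaneSize, dst += outPlaneSize )
            for( int i = 0; i < len; i++ )
            {
                float x = src[i];
                dst[i] = x >= 0.f ? x : negativeSlope*x;  // leaky ReLU per element
            }
    }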
class CV_EXPORTS ReLULayer : public Layer
class CV_EXPORTS ReLULayer : public ActivationLayer
{
public:
float negativeSlope;
@@ -336,37 +346,37 @@ namespace dnn
static Ptr<ReLULayer> create(const LayerParams &params);
};
class CV_EXPORTS ChannelsPReLULayer : public Layer
class CV_EXPORTS ChannelsPReLULayer : public ActivationLayer
{
public:
static Ptr<ChannelsPReLULayer> create(const LayerParams& params);
};
class CV_EXPORTS TanHLayer : public Layer
class CV_EXPORTS TanHLayer : public ActivationLayer
{
public:
static Ptr<TanHLayer> create(const LayerParams &params);
};
class CV_EXPORTS SigmoidLayer : public Layer
class CV_EXPORTS SigmoidLayer : public ActivationLayer
{
public:
static Ptr<SigmoidLayer> create(const LayerParams &params);
};
class CV_EXPORTS BNLLLayer : public Layer
class CV_EXPORTS BNLLLayer : public ActivationLayer
{
public:
static Ptr<BNLLLayer> create(const LayerParams &params);
};
class CV_EXPORTS AbsLayer : public Layer
class CV_EXPORTS AbsLayer : public ActivationLayer
{
public:
static Ptr<AbsLayer> create(const LayerParams &params);
};
class CV_EXPORTS PowerLayer : public Layer
class CV_EXPORTS PowerLayer : public ActivationLayer
{
public:
float power, scale, shift;
@@ -374,7 +384,7 @@ namespace dnn
static Ptr<PowerLayer> create(const LayerParams &params);
};
/* Layers using in semantic segmentation */
/* Layers used in semantic segmentation */
class CV_EXPORTS CropLayer : public Layer
{
......
@@ -52,6 +52,8 @@ namespace dnn
class FullyConnectedLayerImpl : public InnerProductLayer
{
public:
enum { VEC_ALIGN = 8 };
FullyConnectedLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
@@ -65,15 +67,29 @@ public:
CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));
blobs[0] = blobs[0].reshape(1, numOutput);
weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
int vecsize = weightsMat.cols;
if( vecsize % VEC_ALIGN != 0 )
{
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
wpadding.setTo(Scalar::all(0.));
weightsMat = weightsBuf.colRange(0, vecsize);
blobs[0].copyTo(weightsMat);
blobs[0] = weightsMat;
}
if (bias)
blobs[1] = blobs[1].reshape(1, 1);
biasMat = blobs[1] = blobs[1].reshape(1, 1);
else
biasMat = Mat::zeros(1, numOutput, weightsMat.type());
}
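Each weight row is padded with zeros up to a multiple of VEC_ALIGN floats, so the SIMD dot-product loops below can always read full 4- or 8-float groups without a scalar tail. A standalone sketch of the trick, assuming only cv::alignSize and cv::Mat:

    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        const int VEC_ALIGN = 8;
        cv::Mat weights(10, 61, CV_32F, cv::Scalar(1));
        int vecsize_aligned = (int)cv::alignSize(weights.cols, VEC_ALIGN); // 64
        cv::Mat buf(weights.rows, vecsize_aligned, weights.type(), cv::Scalar(0));
        cv::Mat padded = buf.colRange(0, weights.cols);  // logical 10x61 view
        weights.copyTo(padded);                          // trailing 3 columns stay 0
        std::cout << padded.step1() << std::endl;        // 64: rows are 8 floats apart
        return 0;
    }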
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const
std::vector<MatShape> &) const
{
CV_Assert(inputs.size() > 0);
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
@@ -84,36 +100,116 @@ public:
int numOutput = blobs[0].size[0];
outputs.resize(inputs.size(), shape(outerSize, numOutput));
internals.push_back(shape(outerSize, 1));
CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
return false;
}
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
class FullConnected : public ParallelLoopBody
{
internals[0].setTo(1.);
const Mat &weight = blobs[0];
const Mat *biasMat = NULL, *biasOnesMat = NULL;
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
public:
FullConnected(const Mat& srcMat, const Mat& weights, const Mat& biasMat, Mat& dstMat, int nstripes)
{
CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
srcMat.type() == CV_32F &&
(biasMat.empty() || (biasMat.type() == srcMat.type() &&
biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );
srcMat_ = &srcMat;
weights_ = &weights;
biasMat_ = &biasMat;
dstMat_ = &dstMat;
nstripes_ = nstripes;
useAVX2_ = checkHardwareSupport(CPU_AVX2);
}
if (bias)
void operator()(const Range& r) const
{
biasOnesMat = &internals[0];
biasMat = &blobs[1];
int nsamples = srcMat_->rows;
int nw0 = weights_->rows;
int vecsize = srcMat_->cols;
int nstripes = nstripes_;
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights_->step1();
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr = srcMat_->ptr<float>(sampleIdx);
const float* wptr = weights_->ptr<float>(delta);
float* dptr = dstMat_->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat_->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
#if CV_DNN_TRY_AVX2
if( useAVX2_ )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0, k;
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
for( k = 0; k < vecsize; k += 4 )
{
v_float32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
vs3 += v*v_load_aligned(wptr + wstep*3 + k);
}
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
s += v_load(biasptr + i);
v_store(dptr + i, s);
}
#endif
for( ; i < nw; i++, wptr += wstep )
{
float s0=biasptr[i];
for( k = 0; k < vecsize; k++ )
{
float v = sptr[k];
s0 += v*wptr[k];
}
dptr[i] = s0;
}
}
ofs += nw;
}
}
const Mat *srcMat_, *weights_, *biasMat_;
Mat* dstMat_;
int nstripes_;
bool useAVX2_;
};
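The parallel body above partitions the flat output index space (nsamples rows times nw0 outputs) into nstripes nearly equal ranges and decodes each flat offset back into a (sample row, first output column) pair. The arithmetic, isolated as a runnable sketch:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        int nsamples = 3, nw0 = 10, nstripes = 4;
        size_t total = (size_t)nsamples*nw0;                 // 30 output elements
        size_t stripeSize = (total + nstripes - 1)/nstripes; // 8 per stripe
        for( int s = 0; s < nstripes; s++ )
        {
            size_t start = s*stripeSize;
            size_t end = std::min((s + 1)*stripeSize, total);
            int sampleIdx = (int)(start / nw0);               // input row
            int delta = (int)(start - (size_t)sampleIdx*nw0); // first output in it
            printf("stripe %d: [%d, %d) -> sample %d, output %d\n",
                   s, (int)start, (int)end, sampleIdx, delta);
        }
        return 0;
    }

A stripe that straddles a row boundary is handled by the nw = std::min(nw0 - delta, ...) clamp in operator(), which finishes the current row and then re-decodes the offset.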
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &)
{
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
for (size_t i = 0; i < input.size(); i++)
{
Mat srcMat = input[i]->reshape(1, outerSize);
Mat dstMat = output[i].reshape(1, outerSize);
dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T);
if (bias)
dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1);
const int nstripes = getNumThreads();
FullConnected fconn(srcMat, weightsMat, biasMat, dstMat, nstripes);
parallel_for_(Range(0, nstripes), fconn, nstripes);
}
}
@@ -134,6 +230,7 @@ public:
}
bool bias;
Mat weightsMat, biasMat;
};
Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#if CV_DNN_TRY_AVX2
#include <immintrin.h>
namespace cv {
namespace dnn {
void fastConv_avx2( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, bool initOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
// now compute dot product of the weights
// and im2row-transformed part of the tensor
for( int i = 0; i < outCn; i += 3 )
{
const float* wptr0 = weights + i*wstep;
const float* wptr1 = wptr0 + wstep;
const float* wptr2 = wptr1 + wstep;
float* outptr0 = output + i*outPlaneSize;
float* outptr1 = outptr0 + outPlaneSize;
float* outptr2 = outptr1 + outPlaneSize;
float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
}
}
int j = 0;
for( ; j <= blockSize - 4; j += 4 )
{
const float* rptr = rowbuf + j*vecsize_aligned;
__m256 vs00 = _mm256_setzero_ps(), vs01 = _mm256_setzero_ps(),
vs02 = _mm256_setzero_ps(), vs03 = _mm256_setzero_ps(),
vs10 = _mm256_setzero_ps(), vs11 = _mm256_setzero_ps(),
vs12 = _mm256_setzero_ps(), vs13 = _mm256_setzero_ps(),
vs20 = _mm256_setzero_ps(), vs21 = _mm256_setzero_ps(),
vs22 = _mm256_setzero_ps(), vs23 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, rptr += 8 )
{
__m256 w0 = _mm256_load_ps(wptr0 + k);
__m256 w1 = _mm256_load_ps(wptr1 + k);
__m256 w2 = _mm256_load_ps(wptr2 + k);
__m256 r0 = _mm256_load_ps(rptr);
vs00 = _mm256_fmadd_ps(w0, r0, vs00);
vs10 = _mm256_fmadd_ps(w1, r0, vs10);
vs20 = _mm256_fmadd_ps(w2, r0, vs20);
r0 = _mm256_load_ps(rptr + vecsize_aligned);
vs01 = _mm256_fmadd_ps(w0, r0, vs01);
vs11 = _mm256_fmadd_ps(w1, r0, vs11);
vs21 = _mm256_fmadd_ps(w2, r0, vs21);
r0 = _mm256_load_ps(rptr + vecsize_aligned*2);
vs02 = _mm256_fmadd_ps(w0, r0, vs02);
vs12 = _mm256_fmadd_ps(w1, r0, vs12);
vs22 = _mm256_fmadd_ps(w2, r0, vs22);
r0 = _mm256_load_ps(rptr + vecsize_aligned*3);
vs03 = _mm256_fmadd_ps(w0, r0, vs03);
vs13 = _mm256_fmadd_ps(w1, r0, vs13);
vs23 = _mm256_fmadd_ps(w2, r0, vs23);
}
__m256 t0 = _mm256_hadd_ps(_mm256_hadd_ps(vs00, vs01), _mm256_hadd_ps(vs02, vs03));
__m256 t1 = _mm256_hadd_ps(_mm256_hadd_ps(vs10, vs11), _mm256_hadd_ps(vs12, vs13));
__m256 t2 = _mm256_hadd_ps(_mm256_hadd_ps(vs20, vs21), _mm256_hadd_ps(vs22, vs23));
t0 = _mm256_add_ps(t0, _mm256_permute2f128_ps(t0, t0, 1));
t1 = _mm256_add_ps(t1, _mm256_permute2f128_ps(t1, t1, 1));
t2 = _mm256_add_ps(t2, _mm256_permute2f128_ps(t2, t2, 1));
__m256 s0, s1, s2;
if( initOutput )
{
s0 = _mm256_set1_ps(bias0);
s1 = _mm256_set1_ps(bias1);
s2 = _mm256_set1_ps(bias2);
}
else
{
s0 = _mm256_castps128_ps256(_mm_loadu_ps(outptr0 + j));
s1 = _mm256_castps128_ps256(_mm_loadu_ps(outptr1 + j));
s2 = _mm256_castps128_ps256(_mm_loadu_ps(outptr2 + j));
}
s0 = _mm256_add_ps(s0, t0);
s1 = _mm256_add_ps(s1, t1);
s2 = _mm256_add_ps(s2, t2);
_mm_storeu_ps(outptr0 + j, _mm256_castps256_ps128(s0));
_mm_storeu_ps(outptr1 + j, _mm256_castps256_ps128(s1));
_mm_storeu_ps(outptr2 + j, _mm256_castps256_ps128(s2));
}
for( ; j < blockSize; j++ )
{
const float* rptr = rowbuf + j*vecsize_aligned;
float s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
float r0 = rptr[k];
s00 += wptr0[k]*r0;
s10 += wptr1[k]*r0;
s20 += wptr2[k]*r0;
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
_mm256_zeroupper();
}
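The kernel processes three output channels and four output positions per iteration, keeping twelve 8-wide accumulators live. Each _mm256_hadd_ps/_mm256_hadd_ps/_mm256_permute2f128_ps sequence then collapses four accumulators into four packed dot-product sums in the low 128 bits. What that reduction computes for a single accumulator, as a scalar-equivalent helper (for reference only, not part of the patch):

    #include <immintrin.h>

    static float hsum256(__m256 v)
    {
        __m128 lo = _mm256_castps256_ps128(v);
        __m128 hi = _mm256_extractf128_ps(v, 1);
        __m128 s  = _mm_add_ps(lo, hi);   // fold 256 bits to 128
        s = _mm_hadd_ps(s, s);            // fold 4 floats to 2
        s = _mm_hadd_ps(s, s);            // fold 2 floats to 1
        return _mm_cvtss_f32(s);
    }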
// dst = vec * weights^t + bias
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
int i = 0;
for( ; i <= nvecs - 8; i += 8 )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = _mm256_setzero_ps(), vs1 = _mm256_setzero_ps(),
vs2 = _mm256_setzero_ps(), vs3 = _mm256_setzero_ps(),
vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(),
vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);
vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2);
vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3);
vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4);
vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5);
vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6);
vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7);
}
__m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3));
__m256 s1 = _mm256_hadd_ps(_mm256_hadd_ps(vs4, vs5), _mm256_hadd_ps(vs6, vs7));
s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
s1 = _mm256_add_ps(s1, _mm256_permute2f128_ps(s1, s1, 1));
s0 = _mm256_add_ps(s0, _mm256_castps128_ps256(_mm_loadu_ps(bias + i)));
s1 = _mm256_add_ps(s1, _mm256_castps128_ps256(_mm_loadu_ps(bias + i + 4)));
_mm_storeu_ps(dst + i, _mm256_castps256_ps128(s0));
_mm_storeu_ps(dst + i + 4, _mm256_castps256_ps128(s1));
}
float temp = 0.f;
for( ; i < nvecs; i++ )
{
const float* wptr = weights + i*wstep;
__m256 vs0 = _mm256_setzero_ps();
for( int k = 0; k < vecsize; k += 8, wptr += 8 )
{
__m256 v = _mm256_load_ps(vec + k);
vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
}
__m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0);
s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1));
_mm_store_ss(&temp, _mm256_castps256_ps128(s0));
dst[i] = temp + bias[i];
}
_mm256_zeroupper();
}
}
}
#endif
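A plain scalar version of the contract documented above (dst = vec * weights^T + bias) is handy for validating the AVX2 path. This reference sketch is added for illustration and is not part of the patch:

    // weights: nvecs rows of vecsize floats, row stride wstep (in floats)
    static void fastGEMM1T_ref( const float* vec, const float* weights,
                                size_t wstep, const float* bias,
                                float* dst, int nvecs, int vecsize )
    {
        for( int i = 0; i < nvecs; i++ )
        {
            const float* wptr = weights + i*wstep;
            float s = bias[i];
            for( int k = 0; k < vecsize; k++ )
                s += vec[k]*wptr[k];
            dst[i] = s;
        }
    }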
@@ -64,6 +64,21 @@ void getConvPoolOutParams(const Size& inp, const Size &kernel,
void getConvPoolPaddings(const Size& inp, const Size& out,
const Size &kernel, const Size &stride,
const String &padMode, Size &pad);
#if CV_SSE2
#define CV_DNN_TRY_AVX2 1
void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, bool initOutput);
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
#else
#define CV_DNN_TRY_AVX2 0
#endif
}
}
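The guard is two-level: CV_DNN_TRY_AVX2 only says the toolchain targets x86 and the AVX2 translation unit may be compiled, while every call site still checks checkHardwareSupport(CPU_AVX2) at runtime, as FullConnected::operator() does above. A hypothetical wrapper showing the combined pattern (fastGEMM1T_ref is the reference sketch from earlier, not a patch symbol; the patch inlines this check at call sites instead):

    static void fastGEMM1T( const float* vec, const float* weights, size_t wstep,
                            const float* bias, float* dst, int nvecs, int vecsize )
    {
    #if CV_DNN_TRY_AVX2
        if( checkHardwareSupport(CPU_AVX2) )      // runtime CPU feature test
        {
            fastGEMM1T_avx2(vec, weights, wstep, bias, dst, nvecs, vecsize);
            return;
        }
    #endif
        fastGEMM1T_ref(vec, weights, wstep, bias, dst, nvecs, vecsize); // fallback
    }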
......
@@ -41,8 +41,9 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#include "opencv2/imgproc.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/hal/hal.hpp"
#include <algorithm>
namespace cv
@@ -100,45 +101,94 @@ public:
}
}
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
class ChannelLRN : public ParallelLoopBody
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
Mat srcMat = srcBlob.clone();
Mat dstMat = dstBlob;
public:
ChannelLRN(const float* src, float* dst, int channels, int ksize,
float alpha1, float bias1, float beta1,
size_t planeSize, int nsamples, int nstripes)
{
src_ = src; dst_ = dst;
channels_ = channels;
ksize_ = ksize;
alpha1_ = alpha1; bias1_ = bias1; beta1_ = beta1;
planeSize_ = planeSize; nsamples_ = nsamples; nstripes_ = nstripes;
}
for (int n = 0; n < num; n++)
void operator()(const Range& r) const
{
Mat accum = getPlane(dstMat, n, channels-1); //trick for memory saving
accum.setTo(0);
int nsamples = nsamples_, nstripes = nstripes_;
size_t planeSize = planeSize_, planeSize_n = planeSize * nsamples;
size_t elemsPerStripe = (planeSize_n + nstripes - 1)/nstripes;
size_t rstart = r.start*elemsPerStripe;
size_t rend = r.end == nstripes ? planeSize_n : r.end*elemsPerStripe;
rstart = std::min(rstart, planeSize_n);
rend = std::min(rend, planeSize_n);
float alpha1 = alpha1_, bias1 = bias1_, beta1 = beta1_;
int k, channels = channels_, ksize = ksize_;
for (int cn = 0; cn < std::min(ksize, channels); cn++)
cv::accumulateSquare(getPlane(srcMat, n, cn), accum);
AutoBuffer<float> buf_((channels + ksize*2 + 4)*2);
float* acc = (float*)buf_;
float* buf = acc + channels + ksize + 1;
for( k = 0; k <= ksize; k++ )
buf[-k-1] = buf[channels + k] = 0.f;
for (int cn = 0; cn < channels; cn++)
for( size_t ofs = rstart; ofs < rend; )
{
if (cn + ksize < channels)
{
cv::accumulateSquare(getPlane(srcMat, n, cn + ksize), accum);
}
int sampleIdx = (int)(ofs/planeSize);
if( sampleIdx >= nsamples )
break;
size_t ofs0 = ofs - sampleIdx*planeSize;
size_t ofs1 = std::min(planeSize - ofs0, rend - ofs) + ofs;
const float* src = src_ + sampleIdx*planeSize*channels + ofs0;
float* dst = dst_ + sampleIdx*planeSize*channels + ofs0;
if (cn - ksize - 1 >= 0)
for( ; ofs < ofs1; ofs++, src++, dst++ )
{
//subtractSquare
Mat left = getPlane(srcMat, n, cn - ksize - 1);
cv::pow(left, 2, left);
cv::subtract(accum, left, accum);
}
for( k = 0; k < channels; k++ )
buf[k] = src[k*planeSize];
float s = 0;
for( k = 0; k < ksize; k++ )
s += buf[k]*buf[k];
for( k = 0; k < channels; k++ )
{
float x1 = buf[k + ksize];
float x0 = buf[k - ksize - 1];
s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
acc[k] = (float)(alpha1*s + bias1);
}
Mat dst = getPlane(dstMat, n, cn);
accum.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
cv::pow(dst, beta, dst);
cv::divide(getPlane(srcMat, n, cn), dst, dst);
hal::log32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
acc[k] *= beta1;
hal::exp32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
dst[k*planeSize] = buf[k]*acc[k];
}
}
}
const float* src_;
float* dst_;
float alpha1_, bias1_, beta1_;
size_t planeSize_;
int channels_, ksize_, nsamples_, nstripes_;
};
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
size_t planeSize = srcBlob.size[2]*srcBlob.size[3];
int nstripes = std::max(getNumThreads(), 1);
ChannelLRN clrn(srcBlob.ptr<float>(), dstBlob.ptr<float>(), channels,
ksize, alpha/sizeNormFactor, bias, -beta, planeSize, num, nstripes);
parallel_for_(Range(0, nstripes), clrn, nstripes);
}
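For each pixel the loop gathers its values across channels into buf, then maintains a running sum of squares over a sliding window of 2*ksize+1 channels: stepping from channel k-1 to k adds the entering square and removes the leaving one via (x1 + x0)*(x1 - x0) = x1^2 - x0^2, clamped at zero against rounding drift. The final x*(alpha1*s + bias1)^(-beta) is evaluated as exp(-beta*log(...)) using the vectorized hal::log32f/hal::exp32f. A scalar sketch of one pixel, assuming buf points into a buffer zero-padded by ksize+1 entries on each side, as the AutoBuffer setup above arranges:

    #include <algorithm>
    #include <cmath>

    // alpha1 = alpha/sizeNormFactor, bias1 = bias, beta1 = -beta (as in the ctor call)
    static void lrnOnePixelSketch(const float* buf, float* dst, int channels,
                                  int ksize, float alpha1, float bias1, float beta1)
    {
        float s = 0.f;
        for( int k = 0; k < ksize; k++ )   // squares of buf[0..ksize-1]; the loop's
            s += buf[k]*buf[k];            // first step completes channel 0's window
        for( int k = 0; k < channels; k++ )
        {
            float x1 = buf[k + ksize];     // value entering the window
            float x0 = buf[k - ksize - 1]; // value leaving the window
            s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
            // the real code batches the pow via hal::log32f/exp32f over all channels
            dst[k] = buf[k]*std::exp(beta1*std::log(alpha1*s + bias1));
        }
    }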
void sqrBoxFilter_(const Mat &src, Mat &dst)
......
@@ -48,194 +48,6 @@
namespace cv {
namespace dnn {
#if 0
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
#endif
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
int kh = kernel_h, kw = kernel_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
int kelems = kh*kw;
AutoBuffer<int> ofs_(kelems);
int* ofs = ofs_;
int k = 0;
for( int k_r = 0; k_r < kernel_h; k_r++ )
for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
ofs[k] = k_r*dh*width + k_c*dw;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kh*kw*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
{
for( int i_c = 0; i_c < channels; i_c++ )
{
float* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
const float* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
for( k = 0; k < kelems; k++ )
{
data_col_c[k] = data_im_c[ofs[k]];
}
}
}
else
{
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kh*kw;
int in_r = start_in_r + start_k_r*dh;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kw;
int in_c = start_in_c + start_k_c*dw;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
}
};
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col)
{
im2row_CpuPBody<float>::run(data_im, channels, height, width,
kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
height_col, width_col, data_col);
}
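im2row writes, for each of the height_col*width_col output positions, one row containing its full receptive field of channels*kernel_h*kernel_w input samples (border taps fall outside the image and contribute zeros), so the convolution becomes a plain matrix product of the weight matrix against this buffer. The buffer geometry, as a small sketch using the standard output-size formula:

    #include <cstdio>

    int main()
    {
        int height = 32, width = 32, channels = 16;
        int kernel_h = 3, kernel_w = 3, pad_h = 1, pad_w = 1;
        int stride_h = 1, stride_w = 1, dilation_h = 1, dilation_w = 1;
        int height_col = (height + 2*pad_h - dilation_h*(kernel_h - 1) - 1)/stride_h + 1;
        int width_col  = (width  + 2*pad_w - dilation_w*(kernel_w - 1) - 1)/stride_w + 1;
        printf("rows N = %d, row length K = %d\n",
               height_col*width_col, channels*kernel_h*kernel_w); // N = 1024, K = 144
        return 0;
    }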
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
......