Commit b97931e0 authored by Vadim Pisarevsky

Merge pull request #1136 from vpisarev:dnn5

parents 3908909d 75789089
@@ -98,14 +98,19 @@ int main(int argc, char **argv)
    net.setBlob("", inputBlob);        //set the network input
    //! [Set input blob]

    const int N = 3;
    TickMeter tm;

    //! [Make forward pass]
    for( int i = 0; i < N; i++ )
    {
        TickMeter tm_;
        tm_.start();
        net.forward();                 //compute output
        tm_.stop();
        if( i == 0 || tm_.getTimeTicks() < tm.getTimeTicks() )
            tm = tm_;
    }

    //! [Gather output]
...
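The sample now reports the best of N=3 forward passes instead of a single run, which filters out first-run warm-up and scheduling noise. A minimal, self-contained sketch of the same best-of-N idea, using std::chrono rather than TickMeter (bestTimeMs and its arguments are illustrative names, not part of the patch):

// Keep the fastest of N runs of an arbitrary workload; illustration only.
#include <chrono>
#include <functional>

static double bestTimeMs(const std::function<void()>& workload, int N = 3)
{
    double best = 0.0;
    for (int i = 0; i < N; i++)
    {
        auto t0 = std::chrono::steady_clock::now();
        workload();                                   // the code being measured
        auto t1 = std::chrono::steady_clock::now();
        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        if (i == 0 || ms < best)
            best = ms;                                // keep the fastest run
    }
    return best;
}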
@@ -41,6 +41,15 @@ public:
            Mat* inp = inputs[i];
            outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
        }

        varMeanScale = 1.f;
        if (!hasWeights && !hasBias) {
            varMeanScale = *blobs[2].ptr<float>();
            if (varMeanScale != 0)
                varMeanScale = 1/varMeanScale;
        }

        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
    }

    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
@@ -52,16 +61,6 @@ public:
        int weightsBlobIndex = 2;
        int biasBlobIndex = weightsBlobIndex + hasWeights;

        int rows = inpBlob.size[2];
        int cols = inpBlob.size[3];
@@ -92,7 +91,8 @@ public:
    }

    bool hasWeights, hasBias;
    float epsilon, varMeanScale;
    Mat invStdMat;
};

Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
...
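With this batch normalization change, the per-channel multiplier 1/sqrt(variance*scale + epsilon) is computed once in allocate() and cached in invStdMat instead of being rebuilt on every forward pass; the scale is the inverse of the averaging factor stored in blobs[2], guarded against zero exactly as above. A small sketch of that precomputation on plain arrays, with illustrative names:

// Precompute per-channel 1/sqrt(var*s + eps) once; illustration only.
#include <cmath>
#include <vector>

std::vector<float> precomputeInvStd(const std::vector<float>& variance,
                                    float movingAvgFactor, float epsilon)
{
    float s = movingAvgFactor != 0.f ? 1.f / movingAvgFactor : 1.f;  // same guard as the patch
    std::vector<float> invStd(variance.size());
    for (size_t c = 0; c < variance.size(); c++)
        invStd[c] = 1.f / std::sqrt(variance[c] * s + epsilon);
    return invStd;
}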
@@ -54,12 +54,74 @@ namespace dnn
class BaseConvolutionLayerImpl : public ConvolutionLayer
{
public:
    BaseConvolutionLayerImpl()
    {
        numOutput = -1;
        group = -1;
        inpH = inpW = inpCn = 0;
        outH = outW = outCn = 0;
        inpGroupCn = outGroupCn = 0;
        ksize = 0;
        bias = false;
#ifdef HAVE_LAPACK
        int nthreads = cv::getThreadNum();
        if (getBlasThreads() != nthreads)
        {
            setBlasThreads(nthreads);
        }
#endif
    }

    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(inputs.size() > 0);

        init();

        const Mat &input = *inputs[0];
        CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
        for (size_t i = 0; i < inputs.size(); i++)
        {
            CV_Assert(inputs[i]->type() == input.type());
            CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == input.size[1]);
            CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]);
        }

        computeInpOutShape(input);

        if (bias)
        {
            biasOnesBlob.create(1, outH * outW, input.type());
            biasOnesBlob.setTo(1);
        }

        outputs.resize(inputs.size());
        for (size_t i = 0; i < inputs.size(); i++)
        {
            int sz[] = { inputs[i]->size[0], outCn, outH, outW };
            outputs[i].create(4, sz, input.type());
        }

        if (!is1x1())
        {
            colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
            colRowBlob.setTo(0);
        }
    }

    void init()
    {
        CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
        CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);

        bias = (blobs.size() >= 2);
    }

    virtual void computeInpOutShape(const Mat &inpBlob) = 0;

    bool is1x1() const
    {
        return (kernel.height == 1 && kernel.width == 1) &&
               (stride.height == 1 && stride.width == 1) &&
               (dilation.height == 1 && dilation.width == 1);
    }

    int numOutput, group;
    int inpH, inpW, inpCn;
@@ -76,306 +138,185 @@ public:
// (this hunk replaces the old out-of-class method definitions, the old
//  ConvolutionLayerImpl::im2col and a commented-out block of create() helpers
//  with the inline definitions below)
class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
public:
    void computeInpOutShape(const Mat &input)
    {
        CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);

        numOutput = blobs[0].size[0];

        inpH = input.size[2];
        inpW = input.size[3];
        inpCn = input.size[1];
        outCn = numOutput;

        if (padMode.empty())
        {
            outH = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
            outW = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
        }
        else
        {
            getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW);
        }

        group = inpCn / blobs[0].size[1];

        CV_Assert(inpCn % group == 0 && outCn % group == 0);
        CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);

        outGroupCn = outCn / group;
        inpGroupCn = inpCn / group;
        ksize = inpGroupCn * kernel.height * kernel.width;

        colRowBlobShape.clear();
        colRowBlobShape.push_back(outH*outW);
        colRowBlobShape.push_back(ksize);
    }

    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(inputs.size() > 0);

        Mat weightsMat = blobs[0].reshape(1, outCn);
        Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            int numImg = inputs[ii]->size[0];
            Mat inpMat = *inputs[ii];
            Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);

            for (int n = 0; n < numImg; n++)
            {
                for (int g = 0; g < group; g++)
                {
                    Mat curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));

                    im2row(curInp, colRowBlob);

                    _Range kerRange(g * outGroupCn, outGroupCn);
                    Mat kerMat = weightsMat.rowRange(kerRange);

                    _Range outRange((g + n * group) * outGroupCn, outGroupCn);
                    Mat dstMat = outMat.rowRange(outRange);

                    dnn::gemm(kerMat, colRowBlob, 1, dstMat, 0, GEMM_2_T);

                    if (bias)
                    {
                        dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1);
                    }
                }
            }
        }
    }

    void im2row(const Mat &srcImg, Mat &dstRow)
    {
        if (is1x1())
        {
            transpose(srcImg.reshape(1, ksize), dstRow);
        }
        else
        {
            cv::dnn::im2row(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
                            kernel.width, pad.height, pad.width, stride.height, stride.width,
                            dilation.height, dilation.width, outH, outW, dstRow.ptr<float>());
        }
    }
};

class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
public:
    void computeInpOutShape(const Mat &inpBlob)
    {
        CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);

        numOutput = blobs[0].size[0];

        inpH = inpBlob.size[2];
        inpW = inpBlob.size[3];
        inpCn = inpBlob.size[1];

        outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
        outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
        outCn = numOutput;

        group = inpCn / blobs[0].size[1];
        outGroupCn = outCn / group;
        inpGroupCn = inpCn / group;
        ksize = outGroupCn * kernel.height * kernel.width;

        CV_Assert(inpCn % group == 0 && outCn % group == 0);
        CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);

        colRowBlobShape.clear();
        colRowBlobShape.push_back(ksize);
        colRowBlobShape.push_back(inpH * inpW);

        ofsbuf.resize(ksize*3);
        for( int k = 0; k < ksize; k++ )
        {
            int w_offset = k % kernel.width;
            int h_offset = (k / kernel.width) % kernel.height;
            int c_im = k / kernel.height / kernel.width;
            ofsbuf[k*3] = w_offset;
            ofsbuf[k*3+1] = h_offset;
            ofsbuf[k*3+2] = c_im;
        }
    }

    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
    {
        Mat weightsMat = blobs[0].reshape(1, inpCn);
        Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            int numImg = inputs[ii]->size[0];
            Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
            Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);

            for (int n = 0; n < numImg; n++)
            {
                for (int g = 0; g < group; g++)
                {
                    Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
                    Mat &colMat = (is1x1()) ? dstMat : colRowBlob;

                    Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
                    Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));

                    dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);

                    if (!is1x1())
                        col2im(colMat, dstMat);

                    if (bias)
                    {
                        Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
                        dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1);
                    }
                }
            }
        }
    }

    void col2im(const Mat &colMat, Mat &dstImg)
    {
        if (is1x1())
        {
            dstImg = colMat;
            return;
        }
        cv::dnn::col2im(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width,
                        pad.height, pad.width, stride.height, stride.width,
                        dilation.height, dilation.width, dstImg.ptr<float>(), &ofsbuf[0]);
    }

    std::vector<int> ofsbuf;
};

//Convolution and Deconvolution
static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, const LayerParams &params)
...
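The rewritten convolution forward pass unrolls each input patch into a row (im2row) and then performs one GEMM per group against the weight matrix; the 1x1 case reduces to a reshape plus transpose. A minimal single-channel, stride-1, no-padding sketch of the im2row + matrix-product idea on raw float arrays (convolveByIm2Row is an illustrative name, not code from the patch):

// Each output position becomes one row holding the input patch, so the
// convolution collapses into a single matrix product (here a dot product
// per row, since there is only one output channel).
#include <vector>

void convolveByIm2Row(const float* img, int H, int W,
                      const float* kernel, int kH, int kW,
                      float* out /* size (H-kH+1)*(W-kW+1) */)
{
    int outH = H - kH + 1, outW = W - kW + 1;
    int ksize = kH * kW;
    std::vector<float> rows((size_t)outH * outW * ksize);

    // im2row: one row of length ksize per output pixel
    for (int y = 0; y < outH; y++)
        for (int x = 0; x < outW; x++)
        {
            float* row = &rows[((size_t)y * outW + x) * ksize];
            for (int ky = 0; ky < kH; ky++)
                for (int kx = 0; kx < kW; kx++)
                    row[ky * kW + kx] = img[(y + ky) * W + (x + kx)];
        }

    // "GEMM" step: multiply every row by the flattened kernel
    for (int i = 0; i < outH * outW; i++)
    {
        float s = 0.f;
        for (int k = 0; k < ksize; k++)
            s += rows[(size_t)i * ksize + k] * kernel[k];
        out[i] = s;
    }
}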
@@ -15,8 +15,7 @@ using std::pow;
template<typename Func>
class ElementWiseLayer : public Func::Layer
{
public:
    template<typename Dtype>
    class PBody : public cv::ParallelLoopBody
    {
@@ -35,9 +34,7 @@ class ElementWiseLayer : public Func::Layer
        }
    };

    ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}

    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
@@ -58,9 +55,16 @@ public:
            Range sizeRange = Range(0, dst.total());
            CV_Assert(src.type() == CV_32F);

            PBody<float> body(dst, func);
            if( run_parallel )
                cv::parallel_for_(sizeRange, body);
            else
                body(sizeRange);
        }
    }

    Func func;
    bool run_parallel;
};

struct ReLUFunctor
@@ -135,8 +139,24 @@ struct PowerFunctor
    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
    }
};

struct PowerFunctor1
{
    typedef PowerLayer Layer;

    const float scale;
    const float shift;

    PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
        : scale(scale_), shift(shift_) {}

    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        return (TFloat)shift + (TFloat)scale * x;
    }
};
@@ -165,12 +185,12 @@ public:
    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(inputs.size() == 1);
        Mat &inpBlob = *inputs[0];

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            Mat &outBlob = outputs[ii];
            CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());

            CV_Assert(blobs[0].total() == inpBlob.size[1]);
@@ -181,8 +201,16 @@ public:
                Mat inpBlobPlane = getPlane(inpBlob, 0, n);
                Mat outBlobPlane = getPlane(outBlob, 0, n);

                size_t i, planeTotal = inpBlobPlane.total();
                const float* inptr = inpBlobPlane.ptr<float>();
                float* outptr = outBlobPlane.ptr<float>();
                for( i = 0; i < planeTotal; i++ )
                {
                    float val = inptr[i];
                    outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
                }
                //threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
                //scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
            }
        }
    }
@@ -196,7 +224,7 @@ Ptr<_Layer> _Layer::create() { \
Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
{
    float negativeSlope = params.get<float>("negative_slope", 0.f);
    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(false, ReLUFunctor(negativeSlope)));
    l->setParamsFrom(params);

    return l;
@@ -204,7 +232,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
{
    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
    l->setParamsFrom(params);

    return l;
@@ -212,7 +240,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
{
    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
    l->setParamsFrom(params);

    return l;
@@ -228,7 +256,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
{
    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
    l->setParamsFrom(params);

    return l;
@@ -239,7 +267,9 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
    float power = params.get<float>("power", 1.0f);
    float scale = params.get<float>("scale", 1.0f);
    float shift = params.get<float>("shift", 0.0f);
    Ptr<PowerLayer> l(power == 1.f ?
        (PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
        (PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
    l->setParamsFrom(params);

    return l;
...
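The PReLU forward pass now applies f(x) = x for x >= 0 and f(x) = slope*x otherwise in one explicit loop, replacing the former threshold() + scaleAdd() pair, and cheap functors such as ReLU and the power==1 case skip parallel_for_ entirely via the new run_parallel flag. A free-standing sketch of the per-element update (prelu is an illustrative name, not a function from the patch):

// PReLU over a plain float buffer: negative side scaled by the learned slope.
#include <cstddef>

void prelu(const float* in, float* out, size_t n, float slope)
{
    for (size_t i = 0; i < n; i++)
    {
        float v = in[i];
        out[i] = v >= 0.f ? v : slope * v;
    }
}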
@@ -98,15 +98,14 @@ public:
    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
    {
        Mat& output = outputs[0];
        switch (op)
        {
        case SUM:
            CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
            if (0 < coeffs.size())
            {
                output.setTo(0.);
                for (size_t i = 0; i < inputs.size(); i++)
                {
                    output += *inputs[i] * coeffs[i];
@@ -114,32 +113,26 @@ public:
                }
            }
            else
            {
                add(*inputs[0], *inputs[1], output);
                for (size_t i = 2; i < inputs.size(); i++)
                {
                    output += *inputs[i];
                }
            }
            break;
        case PROD:
            output.setTo(1.);
            for (size_t i = 0; i < inputs.size(); i++)
            {
                output = output.mul(*inputs[i]);
            }
            break;
        case MAX:
            cv::max(*inputs[0], *inputs[1], output);
            for (size_t i = 2; i < inputs.size(); i++)
            {
                cv::max(output, *inputs[i], output);
            }
            break;
        default:
            CV_Assert(0);
...
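In the unweighted SUM branch the output is no longer zero-filled and then accumulated; the first two inputs seed the sum via add() and the remaining inputs are added on top, which saves one full pass over the output buffer. A standalone cv::Mat sketch of that pattern (sumBlobs is an illustrative name, not part of the patch):

// Sum a list of same-sized Mats without an initial setTo(0) pass.
#include <opencv2/core.hpp>
#include <vector>

cv::Mat sumBlobs(const std::vector<cv::Mat>& inputs)
{
    CV_Assert(inputs.size() >= 2);
    cv::Mat output;
    cv::add(inputs[0], inputs[1], output);      // seed with the first two inputs
    for (size_t i = 2; i < inputs.size(); i++)
        output += inputs[i];                    // accumulate the remaining ones
    return output;
}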
@@ -44,3 +44,326 @@
#include "opencl_kernels_dnn.hpp"
#include "op_im2col.hpp"
#include "opencl_kernels_dnn.hpp"
namespace cv {
namespace dnn {
#if 0
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
#endif
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
int total = t.height_col*t.width_col;
#if 1
t(Range(0, total));
#else
cv::parallel_for_(Range(0, total), t, 16);
#endif
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
int kh = kernel_h, kw = kernel_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
int kelems = kh*kw;
AutoBuffer<int> ofs_(kelems);
int* ofs = ofs_;
int k = 0;
for( int k_r = 0; k_r < kernel_h; k_r++ )
for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
ofs[k] = k_r*dh*width + k_c*dw;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kh*kw*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
{
for( int i_c = 0; i_c < channels; i_c++ )
{
float* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
const float* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
for( k = 0; k < kelems; k++ )
{
data_col_c[k] = data_im_c[ofs[k]];
}
}
}
else
{
memset(data_col_, 0, kw*kh*channels*sizeof(data_col_[0]));
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kh*kw;
int in_r = start_in_r + start_k_r*dh;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kw;
int in_c = start_in_c + start_k_c*dw;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
}
};
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col)
{
im2row_CpuPBody<float>::run(data_im, channels, height, width,
kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
height_col, width_col, data_col);
}
#if 0
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
const Dtype* data_col_ = data_col;
Dtype* data_im_ = data_im;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im_[index] = val;
}
}
};
#endif
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
Dtype* data_im,
const int* ofsbuf)
{
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c, ofsbuf += 3)
{
//int w_offset = c % kernel_w;
//int h_offset = (c / kernel_w) % kernel_h;
//int c_im = c / kernel_h / kernel_w;
int w_offset = ofsbuf[0];
int h_offset = ofsbuf[1];
int c_im = ofsbuf[2];
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf)
{
//col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
}
}
}
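The col2im path consumes a precomputed offset table: for each column of the unrolled kernel it stores the kernel x offset, y offset and input channel, so the modulo and division arithmetic runs once (in DeConvolutionLayerImpl::computeInpOutShape) rather than inside every col2im call. A short sketch of how such a table can be built (buildCol2ImOffsets is an illustrative name):

// Build the 3-ints-per-column offset table used by the col2im loop above.
#include <vector>

std::vector<int> buildCol2ImOffsets(int channels, int kernel_h, int kernel_w)
{
    int channels_col = channels * kernel_h * kernel_w;
    std::vector<int> ofsbuf(channels_col * 3);
    for (int c = 0; c < channels_col; c++)
    {
        ofsbuf[c*3]     = c % kernel_w;                  // w_offset
        ofsbuf[c*3 + 1] = (c / kernel_w) % kernel_h;     // h_offset
        ofsbuf[c*3 + 2] = c / kernel_h / kernel_w;       // c_im
    }
    return ofsbuf;
}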
@@ -49,264 +49,15 @@ namespace cv
namespace dnn
{

void im2row(const float* data_im, int channels, int height, int width,
            int kernel_h, int kernel_w, int pad_h, int pad_w,
            int stride_h, int stride_w, int dilation_h, int dilation_w,
            int height_col, int width_col, float* data_col);

void col2im(const float* data_col, int channels, int height, int width,
            int kernel_h, int kernel_w, int pad_h, int pad_w,
            int stride_h, int stride_w, int dilation_h, int dilation_w,
            float* data_im, const int* ofsbuf);

// The inline implementations below are removed from this header by the commit;
// their replacements live in the source file shown above.
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
    const Dtype* data_im;
    int channels, height, width;
    int kernel_h, kernel_w;
    int pad_h, pad_w;
    int stride_h, stride_w;
    int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kernel_h*kernel_w*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, cvCeil(-start_in_r/(float)dilation_h));
int end_k_r = std::min(kernel_h, cvCeil((height - start_in_r)/(float)dilation_h));
int start_k_c = std::max(0, cvCeil(-start_in_c/(float)dilation_w));
int end_k_c = std::min(kernel_w, cvCeil((width - start_in_c)/(float)dilation_w));
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kernel_h*kernel_w;
int in_r = start_in_r + start_k_r*dilation_h;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kernel_w;
int in_c = start_in_c + start_k_c*dilation_w;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
};
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
const Dtype* data_col_ = data_col;
Dtype* data_im_ = data_im;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im_[index] = val;
}
}
};
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
Dtype* data_im)
{
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
}
}
...