Commit b97931e0 authored by Vadim Pisarevsky

Merge pull request #1136 from vpisarev:dnn5

parents 3908909d 75789089
......@@ -98,14 +98,19 @@ int main(int argc, char **argv)
net.setBlob("", inputBlob); //set the network input
//! [Set input blob]
const int N = 3;
TickMeter tm;
tm.start();
//! [Make forward pass]
net.forward(); //compute output
//! [Make forward pass]
tm.stop();
for( int i = 0; i < N; i++ )
{
TickMeter tm_;
tm_.start();
net.forward(); //compute output
tm_.stop();
if( i == 0 || tm_.getTimeTicks() < tm.getTimeTicks() )
tm = tm_;
}
//! [Gather output]
......
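The sample now times the best of N=3 forward passes instead of a single run, which filters out one-off costs such as lazy allocations and cold caches. A minimal sketch of the same best-of-N idiom wrapped in a helper (bestForwardTimeMs is an illustrative name, not part of the sample):

    #include <opencv2/core/utility.hpp>   // cv::TickMeter
    #include <opencv2/dnn.hpp>

    // Illustrative helper; assumes an already configured cv::dnn::Net.
    static double bestForwardTimeMs(cv::dnn::Net& net, int N = 3)
    {
        cv::TickMeter best;
        for (int i = 0; i < N; i++)
        {
            cv::TickMeter tm;
            tm.start();
            net.forward();                                  // compute output
            tm.stop();
            if (i == 0 || tm.getTimeTicks() < best.getTimeTicks())
                best = tm;                                  // keep the fastest run
        }
        return best.getTimeMilli();
    }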
......@@ -41,6 +41,15 @@ public:
Mat* inp = inputs[i];
outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
}
varMeanScale = 1.f;
if (!hasWeights && !hasBias) {
varMeanScale = *blobs[2].ptr<float>();
if (varMeanScale != 0)
varMeanScale = 1/varMeanScale;
}
cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
......@@ -52,16 +61,6 @@ public:
int weightsBlobIndex = 2;
int biasBlobIndex = weightsBlobIndex + hasWeights;
float varMeanScale = 1;
if (!hasWeights && !hasBias) {
varMeanScale = *blobs[2].ptr<float>();
if (varMeanScale != 0)
varMeanScale = 1/varMeanScale;
}
Mat invStdMat;
cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
int rows = inpBlob.size[2];
int cols = inpBlob.size[3];
......@@ -92,7 +91,8 @@ public:
}
bool hasWeights, hasBias;
float epsilon;
float epsilon, varMeanScale;
Mat invStdMat;
};
Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
......
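The batch-norm change hoists the varMeanScale and invStdMat computation out of forward() into allocate(), so the per-channel 1/sqrt(var*s + eps) factors are computed once per shape instead of on every call. A minimal sketch of that precomputation, assuming Caffe-style blobs (mean, variance, optional scalar scale in blobs[2]); precomputeInvStd is an illustrative helper, not the layer API:

    #include <opencv2/core.hpp>
    #include <vector>

    // Illustrative helper: the per-channel inverse standard deviation
    // that the patch caches in invStdMat.
    cv::Mat precomputeInvStd(const std::vector<cv::Mat>& blobs,
                             bool hasWeights, bool hasBias, float epsilon)
    {
        float varMeanScale = 1.f;
        if (!hasWeights && !hasBias)                 // Caffe stores a scale factor in blobs[2]
        {
            varMeanScale = *blobs[2].ptr<float>();
            if (varMeanScale != 0.f)
                varMeanScale = 1.f / varMeanScale;
        }
        cv::Mat invStd;
        cv::pow(blobs[1] * varMeanScale + epsilon, -0.5, invStd);   // 1 / sqrt(var*s + eps)
        return invStd;
    }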
......@@ -54,12 +54,74 @@ namespace dnn
class BaseConvolutionLayerImpl : public ConvolutionLayer
{
public:
BaseConvolutionLayerImpl();
virtual void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
BaseConvolutionLayerImpl()
{
numOutput = -1;
group = -1;
inpH = inpW = inpCn = 0;
outH = outW = outCn = 0;
inpGroupCn = outGroupCn = 0;
ksize = 0;
bias = false;
#ifdef HAVE_LAPACK
int nthreads = cv::getThreadNum();
if (getBlasThreads() != nthreads)
{
setBlasThreads(nthreads);
}
#endif
}
void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
CV_Assert(inputs.size() > 0);
init();
const Mat &input = *inputs[0];
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->type() == input.type());
CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == input.size[1]);
CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]);
}
computeInpOutShape(input);
if (bias)
{
biasOnesBlob.create(1, outH * outW, input.type());
biasOnesBlob.setTo(1);
}
void init();
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
{
int sz[] = { inputs[i]->size[0], outCn, outH, outW };
outputs[i].create(4, sz, input.type());
}
if (!is1x1())
{
colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
colRowBlob.setTo(0);
}
}
void init()
{
CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
bias = (blobs.size() >= 2);
}
virtual void computeInpOutShape(const Mat &inpBlob) = 0;
bool is1x1() const;
bool is1x1() const
{
return (kernel.height == 1 && kernel.width == 1) &&
(stride.height == 1 && stride.width == 1) &&
(dilation.height == 1 && dilation.width == 1);
}
int numOutput, group;
int inpH, inpW, inpCn;
......@@ -76,306 +138,185 @@ public:
class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
public:
virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
virtual void computeInpOutShape(const Mat &inpBlob);
void im2col(const Mat &srcImg, Mat &dstCol);
void im2row(const Mat &srcImg, Mat &dstRow);
};
class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
public:
virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs);
virtual void computeInpOutShape(const Mat &inpBlob);
void col2im(const Mat &colMat, Mat &dstImg);
};
BaseConvolutionLayerImpl::BaseConvolutionLayerImpl():
numOutput(-1), group(-1),
inpH(0), inpW(0), inpCn(0),
outH(0), outW(0), outCn(0),
inpGroupCn(0), outGroupCn(0),
ksize(0), bias(false)
{
#ifdef HAVE_LAPACK
if (getBlasThreads() != cv::getThreadNum())
void computeInpOutShape(const Mat &input)
{
setBlasThreads(cv::getThreadNum());
}
#endif
}
CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
void BaseConvolutionLayerImpl::init()
{
CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
numOutput = blobs[0].size[0];
bias = (blobs.size() >= 2);
}
inpH = input.size[2];
inpW = input.size[3];
inpCn = input.size[1];
outCn = numOutput;
void BaseConvolutionLayerImpl::allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
CV_Assert(inputs.size() > 0);
init();
if (padMode.empty())
{
outH = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
outW = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
}
else
{
getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW);
}
const Mat &input = *inputs[0];
CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->type() == input.type());
CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == input.size[1]);
CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]);
}
group = inpCn / blobs[0].size[1];
computeInpOutShape(input);
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
if (bias)
{
biasOnesBlob.create(1, outH * outW, input.type());
biasOnesBlob.setTo(1);
}
outGroupCn = outCn / group;
inpGroupCn = inpCn / group;
ksize = inpGroupCn * kernel.height * kernel.width;
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
{
int sz[] = { inputs[i]->size[0], outCn, outH, outW };
outputs[i].create(4, sz, input.type());
colRowBlobShape.clear();
colRowBlobShape.push_back(outH*outW);
colRowBlobShape.push_back(ksize);
}
if (!is1x1())
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
colRowBlob.setTo(0);
}
}
CV_Assert(inputs.size() > 0);
bool BaseConvolutionLayerImpl::is1x1() const
{
return (kernel.height == 1 && kernel.width == 1) &&
(stride.height == 1 && stride.width == 1) &&
(dilation.height == 1 && dilation.width == 1);
}
Mat weightsMat = blobs[0].reshape(1, outCn);
Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
void ConvolutionLayerImpl::computeInpOutShape(const Mat &input)
{
CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
numOutput = blobs[0].size[0];
inpH = input.size[2];
inpW = input.size[3];
inpCn = input.size[1];
outCn = numOutput;
if (padMode.empty())
{
outH = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
outW = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
}
else
{
getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW);
}
group = inpCn / blobs[0].size[1];
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
outGroupCn = outCn / group;
inpGroupCn = inpCn / group;
ksize = inpGroupCn * kernel.height * kernel.width;
colRowBlobShape.clear();
colRowBlobShape.push_back(outH*outW);
colRowBlobShape.push_back(ksize);
}
void ConvolutionLayerImpl::forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
CV_Assert(inputs.size() > 0);
Mat weightsMat = blobs[0].reshape(1, outCn);
Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
for (size_t ii = 0; ii < outputs.size(); ii++)
{
int numImg = inputs[ii]->size[0];
Mat inpMat = *inputs[ii];
Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);
for (int n = 0; n < numImg; n++)
for (size_t ii = 0; ii < outputs.size(); ii++)
{
for (int g = 0; g < group; g++)
int numImg = inputs[ii]->size[0];
Mat inpMat = *inputs[ii];
Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);
for (int n = 0; n < numImg; n++)
{
Mat colMat, curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
for (int g = 0; g < group; g++)
{
Mat curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));
im2row(curInp, colMat);
im2row(curInp, colRowBlob);
_Range kerRange(g * outGroupCn, outGroupCn);
Mat kerMat = weightsMat.rowRange(kerRange);
_Range kerRange(g * outGroupCn, outGroupCn);
Mat kerMat = weightsMat.rowRange(kerRange);
_Range outRange((g + n * group) * outGroupCn, outGroupCn);
Mat dstMat = outMat.rowRange(outRange);
_Range outRange((g + n * group) * outGroupCn, outGroupCn);
Mat dstMat = outMat.rowRange(outRange);
dnn::gemm(kerMat, colMat, 1, dstMat, 0, GEMM_2_T);
dnn::gemm(kerMat, colRowBlob, 1, dstMat, 0, GEMM_2_T);
if (bias)
{
dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1);
if (bias)
{
dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1);
}
}
}
}
}
}
void ConvolutionLayerImpl::im2col(const Mat &srcImg, Mat &dstCol)
{
if (is1x1())
void im2row(const Mat &srcImg, Mat &dstRow)
{
dstCol = srcImg.reshape(1, ksize);
return;
if (is1x1())
{
transpose(srcImg.reshape(1, ksize), dstRow);
}
else
{
cv::dnn::im2row(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
kernel.width, pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, outH, outW, dstRow.ptr<float>());
}
}
};
Mat &colMat = colRowBlob;
if (srcImg.type() == CV_32F)
im2col_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
kernel.width, pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, outH, outW, colMat.ptr<float>());
if (srcImg.type() == CV_64F)
im2col_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height,
kernel.width, pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, outH, outW, colMat.ptr<double>());
dstCol = colMat;
}
void ConvolutionLayerImpl::im2row(const Mat &srcImg, Mat &dstRow)
class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
if (is1x1())
public:
void computeInpOutShape(const Mat &inpBlob)
{
dstRow = srcImg.reshape(1, ksize).t();
return;
}
Mat &colMat = colRowBlob;
if (srcImg.type() == CV_32F)
im2row_CpuPBody<float>::run(srcImg.ptr<float>(), inpGroupCn, inpH, inpW, kernel.height,
kernel.width, pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, outH, outW, colMat.ptr<float>());
if (srcImg.type() == CV_64F)
im2row_CpuPBody<double>::run(srcImg.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height,
kernel.width, pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, outH, outW, colMat.ptr<double>());
dstRow = colMat;
}
CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
//Deconvolution
numOutput = blobs[0].size[0];
void DeConvolutionLayerImpl::computeInpOutShape(const Mat &inpBlob)
{
CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
inpH = inpBlob.size[2];
inpW = inpBlob.size[3];
inpCn = inpBlob.size[1];
numOutput = blobs[0].size[0];
outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
outCn = numOutput;
inpH = inpBlob.size[2];
inpW = inpBlob.size[3];
inpCn = inpBlob.size[1];
group = inpCn / blobs[0].size[1];
outGroupCn = outCn / group;
inpGroupCn = inpCn / group;
ksize = outGroupCn * kernel.height * kernel.width;
outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
outCn = numOutput;
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
group = inpCn / blobs[0].size[1];
outGroupCn = outCn / group;
inpGroupCn = inpCn / group;
ksize = outGroupCn * kernel.height * kernel.width;
colRowBlobShape.clear();
colRowBlobShape.push_back(ksize);
colRowBlobShape.push_back(inpH * inpW);
CV_Assert(inpCn % group == 0 && outCn % group == 0);
CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
colRowBlobShape.clear();
colRowBlobShape.push_back(ksize);
colRowBlobShape.push_back(inpH * inpW);
}
void DeConvolutionLayerImpl::forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
{
Mat weightsMat = blobs[0].reshape(1, inpCn);
Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
ofsbuf.resize(ksize*3);
for( int k = 0; k < ksize; k++ )
{
int w_offset = k % kernel.width;
int h_offset = (k / kernel.width) % kernel.height;
int c_im = k / kernel.height / kernel.width;
ofsbuf[k*3] = w_offset;
ofsbuf[k*3+1] = h_offset;
ofsbuf[k*3+2] = c_im;
}
}
for (size_t ii = 0; ii < outputs.size(); ii++)
void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
{
int numImg = inputs[ii]->size[0];
Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);
Mat weightsMat = blobs[0].reshape(1, inpCn);
Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat();
for (int n = 0; n < numImg; n++)
for (size_t ii = 0; ii < outputs.size(); ii++)
{
for (int g = 0; g < group; g++)
int numImg = inputs[ii]->size[0];
Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);
for (int n = 0; n < numImg; n++)
{
Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
Mat &colMat = (is1x1()) ? dstMat : colRowBlob;
for (int g = 0; g < group; g++)
{
Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
Mat &colMat = (is1x1()) ? dstMat : colRowBlob;
Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
if (!is1x1())
col2im(colMat, dstMat);
if (!is1x1())
col2im(colMat, dstMat);
if (bias)
{
Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1);
if (bias)
{
Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1);
}
}
}
}
}
}
void DeConvolutionLayerImpl::col2im(const Mat &colMat, Mat &dstImg)
{
if (is1x1())
void col2im(const Mat &colMat, Mat &dstImg)
{
dstImg = colMat;
return;
if (is1x1())
{
dstImg = colMat;
return;
}
cv::dnn::col2im(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width,
pad.height, pad.width, stride.height, stride.width,
dilation.height, dilation.width, dstImg.ptr<float>(), &ofsbuf[0]);
}
if (dstImg.type() == CV_32F)
col2im_CpuPBody<float>::run(colMat.ptr<float>(), outGroupCn, outH, outW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr<float>());
if (dstImg.type() == CV_64F)
col2im_CpuPBody<double>::run(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr<double>());
}
//Initializers
/*Ptr<BaseConvolutionLayer> ConvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation)
{
ConvolutionLayerImpl *l = new ConvolutionLayerImpl();
l->kernel = kernel;
l->pad = pad;
l->stride = stride;
l->dilation = dilation;
return Ptr<BaseConvolutionLayer>(l);
}
Ptr<BaseConvolutionLayer> DeconvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation, Size adjustPad)
{
DeConvolutionLayerImpl *l = new DeConvolutionLayerImpl();
l->kernel = kernel;
l->pad = pad;
l->stride = stride;
l->dilation = dilation;
l->adjustPad = adjustPad;
return Ptr<BaseConvolutionLayer>(l);
}*/
std::vector<int> ofsbuf;
};
//Convolution and Deconvolution
static void initConvDeconvLayerFromCaffe(Ptr<BaseConvolutionLayer> l, const LayerParams &params)
......
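For the convolution refactor, the forward pass unrolls each output position's receptive field into one row of colRowBlob (outH*outW rows of ksize = inpGroupCn*kh*kw elements) and then multiplies the group's weight matrix by the transposed row matrix. A rough sketch of that im2row + GEMM step for a single image and group, using public cv::gemm in place of the internal dnn::gemm; convViaIm2Row and its arguments are illustrative:

    #include <opencv2/core.hpp>

    // Illustrative only: one group of one image, with `rows` already
    // produced by im2row and laid out as (outH*outW) x ksize.
    void convViaIm2Row(const cv::Mat& rows,      // (outH*outW) x ksize
                       const cv::Mat& weights,   // outGroupCn x ksize
                       const cv::Mat& biases,    // outGroupCn x 1, may be empty
                       cv::Mat& dst)             // outGroupCn x (outH*outW)
    {
        // dst = weights * rows^T, the same product the patch issues via
        // dnn::gemm(kerMat, colRowBlob, 1, dstMat, 0, GEMM_2_T)
        cv::gemm(weights, rows, 1.0, cv::noArray(), 0.0, dst, cv::GEMM_2_T);
        if (!biases.empty())
            dst += biases * cv::Mat::ones(1, dst.cols, dst.type());   // broadcast bias
    }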
......@@ -15,8 +15,7 @@ using std::pow;
template<typename Func>
class ElementWiseLayer : public Func::Layer
{
Func func;
public:
template<typename Dtype>
class PBody : public cv::ParallelLoopBody
{
......@@ -35,9 +34,7 @@ class ElementWiseLayer : public Func::Layer
}
};
public:
ElementWiseLayer(const Func &f=Func()) : func(f) {}
ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}
void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
......@@ -58,9 +55,16 @@ public:
Range sizeRange = Range(0, dst.total());
CV_Assert(src.type() == CV_32F);
cv::parallel_for_(sizeRange, PBody<float>(dst, func));
PBody<float> body(dst, func);
if( run_parallel )
cv::parallel_for_(sizeRange, body);
else
body(sizeRange);
}
}
Func func;
bool run_parallel;
};
struct ReLUFunctor
......@@ -135,8 +139,24 @@ struct PowerFunctor
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return power == 1.0f ? (TFloat)shift + (TFloat)scale * x :
pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
}
};
struct PowerFunctor1
{
typedef PowerLayer Layer;
const float scale;
const float shift;
PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
: scale(scale_), shift(shift_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return (TFloat)shift + (TFloat)scale * x;
}
};
......@@ -165,12 +185,12 @@ public:
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
CV_Assert(inputs.size() == 1);
Mat &inpBlob = *inputs[0];
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Mat &outBlob = outputs[ii];
CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());
CV_Assert(blobs[0].total() == inpBlob.size[1]);
......@@ -181,8 +201,16 @@ public:
Mat inpBlobPlane = getPlane(inpBlob, 0, n);
Mat outBlobPlane = getPlane(outBlob, 0, n);
threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
size_t i, planeTotal = inpBlobPlane.total();
const float* inptr = inpBlobPlane.ptr<float>();
float* outptr = outBlobPlane.ptr<float>();
for( i = 0; i < planeTotal; i++ )
{
float val = inptr[i];
outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
}
//threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
//scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
}
}
}
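The PReLU forward now applies the slope directly per element instead of going through threshold() and scaleAdd(). A one-function sketch of the same formula (out = x for x >= 0, slope*x otherwise); preluPlane is an illustrative name:

    // Illustrative helper: PReLU over one channel plane, the formula the
    // rewritten loop computes in place of threshold() + scaleAdd().
    static void preluPlane(const float* in, float* out, size_t total, float slope)
    {
        for (size_t i = 0; i < total; i++)
        {
            float v = in[i];
            out[i] = v * (v >= 0.f ? 1.f : slope);
        }
    }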
......@@ -196,7 +224,7 @@ Ptr<_Layer> _Layer::create() { \
Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
{
float negativeSlope = params.get<float>("negative_slope", 0.f);
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(false, ReLUFunctor(negativeSlope)));
l->setParamsFrom(params);
return l;
......@@ -204,7 +232,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
{
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -212,7 +240,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
{
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -228,7 +256,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
{
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -239,7 +267,9 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
float power = params.get<float>("power", 1.0f);
float scale = params.get<float>("scale", 1.0f);
float shift = params.get<float>("shift", 0.0f);
Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
Ptr<PowerLayer> l(power == 1.f ?
(PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
(PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
l->setParamsFrom(params);
return l;
......
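The activation layers gain a run_parallel flag: cheap functors (ReLU, and the new PowerFunctor1 used when power == 1) run single-threaded because the parallel_for_ overhead dominates, while tanh, sigmoid, BNLL and general pow stay parallel. A sketch of that dispatch, assuming a cv::ParallelLoopBody-style body; runElementWise is an illustrative helper:

    #include <opencv2/core.hpp>

    // Illustrative dispatch helper: Body is any cv::ParallelLoopBody
    // subclass, such as ElementWiseLayer::PBody<float> in the patch.
    template<typename Body>
    void runElementWise(const Body& body, int total, bool runParallel)
    {
        cv::Range range(0, total);
        if (runParallel)
            cv::parallel_for_(range, body);   // heavy functors: tanh, sigmoid, pow
        else
            body(range);                      // light functors: ReLU, shift+scale
    }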
......@@ -98,15 +98,14 @@ public:
void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
{
Mat& output = outputs[0];
switch (op)
{
case SUM:
{
CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
Mat& output = outputs[0];
output.setTo(0.);
if (0 < coeffs.size())
{
output.setTo(0.);
for (size_t i = 0; i < inputs.size(); i++)
{
output += *inputs[i] * coeffs[i];
......@@ -114,32 +113,26 @@ public:
}
else
{
for (size_t i = 0; i < inputs.size(); i++)
add(*inputs[0], *inputs[1], output);
for (size_t i = 2; i < inputs.size(); i++)
{
output += *inputs[i];
}
}
}
break;
case PROD:
{
Mat& output = outputs[0];
output.setTo(1.);
for (size_t i = 0; i < inputs.size(); i++)
{
output = output.mul(*inputs[i]);
}
}
break;
case MAX:
{
Mat& output = outputs[0];
cv::max(*inputs[0], *inputs[1], output);
for (size_t i = 2; i < inputs.size(); i++)
{
cv::max(output, *inputs[i], output);
}
}
break;
default:
CV_Assert(0);
......
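The SUM branch of the eltwise layer no longer zero-fills the output when there are no coefficients: it starts from add(input0, input1) and accumulates the remaining inputs in place, saving one pass over the data. A minimal sketch (eltwiseSum is an illustrative name):

    #include <opencv2/core.hpp>
    #include <vector>

    // Illustrative helper: element-wise sum of all inputs without coefficients.
    void eltwiseSum(const std::vector<cv::Mat*>& inputs, cv::Mat& output)
    {
        CV_Assert(inputs.size() >= 2);
        cv::add(*inputs[0], *inputs[1], output);   // first pair in one pass
        for (size_t i = 2; i < inputs.size(); i++)
            output += *inputs[i];                  // accumulate the rest in place
    }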
......@@ -44,3 +44,326 @@
#include "opencl_kernels_dnn.hpp"
#include "op_im2col.hpp"
#include "opencl_kernels_dnn.hpp"
namespace cv {
namespace dnn {
#if 0
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
#endif
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
int total = t.height_col*t.width_col;
#if 1
t(Range(0, total));
#else
cv::parallel_for_(Range(0, total), t, 16);
#endif
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
int kh = kernel_h, kw = kernel_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
int kelems = kh*kw;
AutoBuffer<int> ofs_(kelems);
int* ofs = ofs_;
int k = 0;
for( int k_r = 0; k_r < kernel_h; k_r++ )
for( int k_c = 0; k_c < kernel_w; k_c++, k++ )
ofs[k] = k_r*dh*width + k_c*dw;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kh*kw*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, (-start_in_r + dilation_h-1)/dilation_h);
int end_k_r = std::min(kh, (height - start_in_r + dilation_h-1)/dilation_h);
int start_k_c = std::max(0, (-start_in_c + dilation_w-1)/dilation_w);
int end_k_c = std::min(kw, (width - start_in_c + dilation_w-1)/dilation_w);
if( start_k_r == 0 && end_k_r == kh && start_k_c == 0 && end_k_c == kw )
{
for( int i_c = 0; i_c < channels; i_c++ )
{
float* data_col_c = data_col_ + out_row_offset + i_c*kh*kw;
const float* data_im_c = data_im_ + (i_c*height + start_in_r)*width + start_in_c;
for( k = 0; k < kelems; k++ )
{
data_col_c[k] = data_im_c[ofs[k]];
}
}
}
else
{
memset(data_col_, 0, kw*kh*channels*sizeof(data_col_[0]));
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kh*kw;
int in_r = start_in_r + start_k_r*dh;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kw;
int in_c = start_in_c + start_k_c*dw;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
}
};
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col)
{
im2row_CpuPBody<float>::run(data_im, channels, height, width,
kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
height_col, width_col, data_col);
}
#if 0
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
const Dtype* data_col_ = data_col;
Dtype* data_im_ = data_im;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im_[index] = val;
}
}
};
#endif
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
Dtype* data_im,
const int* ofsbuf)
{
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c, ofsbuf += 3)
{
//int w_offset = c % kernel_w;
//int h_offset = (c / kernel_w) % kernel_h;
//int c_im = c / kernel_h / kernel_w;
int w_offset = ofsbuf[0];
int h_offset = ofsbuf[1];
int c_im = ofsbuf[2];
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf)
{
//col2im_CpuPBody<float>::run(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_im);
col2im_cpu(data_col, channels, height, width, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w, data_im, ofsbuf);
}
}
}
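The rewritten op_im2col.cpp keeps im2row (currently forced single-threaded via the #if 1 switch) and a single-threaded col2im_cpu that reads precomputed (w_offset, h_offset, c_im) triples from ofsbuf, while the old parallel im2col/col2im bodies are parked under #if 0. A small sketch of how those triples are built, matching the loop in DeConvolutionLayerImpl::forward; makeCol2ImOffsets is an illustrative helper:

    #include <vector>

    // Illustrative helper: one (w_offset, h_offset, c_im) triple per element
    // of the unrolled kernel, ksize = channels * kernel_h * kernel_w entries.
    std::vector<int> makeCol2ImOffsets(int channels, int kernel_h, int kernel_w)
    {
        int ksize = channels * kernel_h * kernel_w;
        std::vector<int> ofsbuf(ksize * 3);
        for (int k = 0; k < ksize; k++)
        {
            ofsbuf[k*3 + 0] = k % kernel_w;                  // w_offset
            ofsbuf[k*3 + 1] = (k / kernel_w) % kernel_h;     // h_offset
            ofsbuf[k*3 + 2] = k / kernel_w / kernel_h;       // c_im
        }
        return ofsbuf;
    }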
......@@ -49,264 +49,15 @@ namespace cv
namespace dnn
{
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kernel_h*kernel_w*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, cvCeil(-start_in_r/(float)dilation_h));
int end_k_r = std::min(kernel_h, cvCeil((height - start_in_r)/(float)dilation_h));
int start_k_c = std::max(0, cvCeil(-start_in_c/(float)dilation_w));
int end_k_c = std::min(kernel_w, cvCeil((width - start_in_c)/(float)dilation_w));
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kernel_h*kernel_w;
int in_r = start_in_r + start_k_r*dilation_h;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kernel_w;
int in_c = start_in_c + start_k_c*dilation_w;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
};
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
const Dtype* data_col_ = data_col;
Dtype* data_im_ = data_im;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im_[index] = val;
}
}
};
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
Dtype* data_im)
{
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col);
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf);
}
}
......