Commit b97931e0 authored by Vadim Pisarevsky

Merge pull request #1136 from vpisarev:dnn5

parents 3908909d 75789089
@@ -98,14 +98,19 @@ int main(int argc, char **argv)
     net.setBlob("", inputBlob);        //set the network input
     //! [Set input blob]

+    const int N = 3;
     TickMeter tm;
-    tm.start();

     //! [Make forward pass]
-    net.forward();                          //compute output
-    //! [Make forward pass]
-
-    tm.stop();
+    for( int i = 0; i < N; i++ )
+    {
+        TickMeter tm_;
+        tm_.start();
+        net.forward();                      //compute output
+        tm_.stop();
+        if( i == 0 || tm_.getTimeTicks() < tm.getTimeTicks() )
+            tm = tm_;
+    }

     //! [Gather output]
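Note: the sample now reports the fastest of N = 3 forward passes instead of timing a single run, which filters out one-off costs such as first-call allocations and lazy initialization. A minimal sketch of the same best-of-N idiom, assuming nothing beyond cv::TickMeter from the OpenCV core API (`work` is a stand-in for net.forward()):

    #include <opencv2/core/utility.hpp>

    // Best-of-N timing: return the fastest of N runs of an arbitrary callable.
    template<typename F>
    double bestOfN(F work, int N = 3)
    {
        double bestMs = 0;
        for (int i = 0; i < N; i++)
        {
            cv::TickMeter tm;
            tm.start();
            work();                        // e.g. net.forward()
            tm.stop();
            if (i == 0 || tm.getTimeMilli() < bestMs)
                bestMs = tm.getTimeMilli();
        }
        return bestMs;                     // milliseconds of the fastest run
    }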
@@ -41,6 +41,15 @@ public:
             Mat* inp = inputs[i];
             outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
         }
+
+        varMeanScale = 1.f;
+        if (!hasWeights && !hasBias) {
+            varMeanScale = *blobs[2].ptr<float>();
+            if (varMeanScale != 0)
+                varMeanScale = 1/varMeanScale;
+        }
+
+        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
     }

     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
@@ -52,16 +61,6 @@ public:
         int weightsBlobIndex = 2;
         int biasBlobIndex = weightsBlobIndex + hasWeights;

-        float varMeanScale = 1;
-        if (!hasWeights && !hasBias) {
-            varMeanScale = *blobs[2].ptr<float>();
-            if (varMeanScale != 0)
-                varMeanScale = 1/varMeanScale;
-        }
-
-        Mat invStdMat;
-        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
-
         int rows = inpBlob.size[2];
         int cols = inpBlob.size[3];
@@ -92,7 +91,8 @@ public:
     }

     bool hasWeights, hasBias;
-    float epsilon;
+    float epsilon, varMeanScale;
+    Mat invStdMat;
 };

 Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
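Note: both hunks above hoist the batch-norm denominator out of forward() and into allocate(), so varMeanScale and invStdMat become class members computed once per shape change instead of on every inference. A sketch of the cached quantities under the Caffe BatchNorm blob convention this code appears to follow (blobs[0] = running mean, blobs[1] = running variance, blobs[2] = moving-average scale factor):

    // Cached once in allocate() (Caffe BatchNorm blob layout assumed):
    float s = *blobs[2].ptr<float>();                  // moving-average scale factor
    float varMeanScale = (s != 0.f) ? 1.f/s : 1.f;
    Mat invStdMat;
    cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);  // 1/sqrt(var + eps)
    // forward() then computes, per channel c:
    //   y = (x - mean[c]*varMeanScale) * invStdMat[c] * weight[c] + bias[c]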
@@ -15,8 +15,7 @@ using std::pow;
 template<typename Func>
 class ElementWiseLayer : public Func::Layer
 {
-    Func func;
-
+public:
     template<typename Dtype>
     class PBody : public cv::ParallelLoopBody
     {
@@ -35,9 +34,7 @@ class ElementWiseLayer : public Func::Layer
         }
     };

-public:
-
-    ElementWiseLayer(const Func &f=Func()) : func(f) {}
+    ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}

     void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
@@ -58,9 +55,16 @@ public:
             Range sizeRange = Range(0, dst.total());
             CV_Assert(src.type() == CV_32F);

-            cv::parallel_for_(sizeRange, PBody<float>(dst, func));
+            PBody<float> body(dst, func);
+            if( run_parallel )
+                cv::parallel_for_(sizeRange, body);
+            else
+                body(sizeRange);
         }
     }
+
+    Func func;
+    bool run_parallel;
 };

 struct ReLUFunctor
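Note: the new run_parallel flag lets each activation pick between cv::parallel_for_ and a direct call. Since a cv::ParallelLoopBody is an ordinary functor, invoking body(sizeRange) runs the whole range on the calling thread with no scheduling overhead, which is the better trade for cheap element-wise ops. A self-contained sketch of the idiom (the ScaleBody class is illustrative only):

    #include <opencv2/core/utility.hpp>

    // A trivial loop body that can run either threaded or inline.
    class ScaleBody : public cv::ParallelLoopBody
    {
    public:
        ScaleBody(float* data_, float k_) : data(data_), k(k_) {}
        virtual void operator()(const cv::Range& r) const
        {
            for (int i = r.start; i < r.end; i++)
                data[i] *= k;
        }
        float* data;
        float k;
    };

    void scaleAll(float* data, int n, float k, bool runParallel)
    {
        ScaleBody body(data, k);
        cv::Range whole(0, n);
        if (runParallel)
            cv::parallel_for_(whole, body);  // pays off for expensive functors (exp, tanh, pow)
        else
            body(whole);                     // cheap functors skip the threading overhead
    }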
@@ -135,8 +139,24 @@ struct PowerFunctor
     template<typename TFloat>
     inline TFloat operator()(TFloat x) const
     {
-        return power == 1.0f ? (TFloat)shift + (TFloat)scale * x :
-               pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+        return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
+    }
+};
+
+struct PowerFunctor1
+{
+    typedef PowerLayer Layer;
+
+    const float scale;
+    const float shift;
+
+    PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
+        : scale(scale_), shift(shift_) {}
+
+    template<typename TFloat>
+    inline TFloat operator()(TFloat x) const
+    {
+        return (TFloat)shift + (TFloat)scale * x;
     }
 };
@@ -165,12 +185,12 @@ public:
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
         CV_Assert(inputs.size() == 1);
-
         Mat &inpBlob = *inputs[0];

         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
             Mat &outBlob = outputs[ii];
+            CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());

             CV_Assert(blobs[0].total() == inpBlob.size[1]);
@@ -181,8 +201,16 @@ public:
                 Mat inpBlobPlane = getPlane(inpBlob, 0, n);
                 Mat outBlobPlane = getPlane(outBlob, 0, n);

-                threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
-                scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
+                size_t i, planeTotal = inpBlobPlane.total();
+                const float* inptr = inpBlobPlane.ptr<float>();
+                float* outptr = outBlobPlane.ptr<float>();
+                for( i = 0; i < planeTotal; i++ )
+                {
+                    float val = inptr[i];
+                    outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
+                }
+                //threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
+                //scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
             }
         }
     }
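Note: the fused loop replaces the two whole-plane calls kept above as comments, and the two forms agree: THRESH_TOZERO_INV produces t = min(x, 0), and scaleAdd then forms x + (slopeWeight - 1)*t, which is x for x >= 0 and slopeWeight*x for x < 0 — exactly val*(val >= 0.f ? 1.f : slopeWeight). The difference is that the old version makes two full passes over the plane plus a temporary, while the loop makes one. The scalar kernel in isolation, as a sketch:

    // PReLU over a contiguous float plane, equivalent to the loop above.
    static void preluPlane(const float* in, float* out, size_t n, float slope)
    {
        for (size_t i = 0; i < n; i++)
        {
            float v = in[i];
            out[i] = v >= 0.f ? v : slope*v;   // single pass, no temporaries
        }
    }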
@@ -196,7 +224,7 @@ Ptr<_Layer> _Layer::create() { \

 Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
 {
     float negativeSlope = params.get<float>("negative_slope", 0.f);
-    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
+    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(false, ReLUFunctor(negativeSlope)));
     l->setParamsFrom(params);

     return l;
@@ -204,7 +232,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)

 Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
 {
-    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
+    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
     l->setParamsFrom(params);

     return l;
@@ -212,7 +240,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)

 Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
 {
-    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
+    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
     l->setParamsFrom(params);

     return l;
@@ -228,7 +256,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)

 Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
 {
-    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
+    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
     l->setParamsFrom(params);

     return l;
@@ -239,7 +267,9 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
     float power = params.get<float>("power", 1.0f);
     float scale = params.get<float>("scale", 1.0f);
     float shift = params.get<float>("shift", 0.0f);
-    Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
+    Ptr<PowerLayer> l(power == 1.f ?
+        (PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
+        (PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
     l->setParamsFrom(params);

     return l;
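Note: PowerLayer::create now selects the functor once, at construction. When power == 1 the layer reduces to the affine map shift + scale*x, so the serial PowerFunctor1 path avoids both pow() and the per-element ternary that used to sit inside PowerFunctor, while genuine powers keep the parallel pow() path; the casts to PowerLayer* give the two ?: branches a common type for Ptr<>. A sketch of the same choose-the-functor-once pattern, with illustrative names:

    #include <cmath>
    #include <cstddef>

    // Specialized functor for the degenerate case, general functor otherwise.
    struct AffineOp    { float a, b;    float operator()(float x) const { return a*x + b; } };
    struct PowAffineOp { float p, a, b; float operator()(float x) const { return std::pow(a*x + b, p); } };

    template<typename Op>
    static void apply(Op op, float* data, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            data[i] = op(data[i]);
    }

    void applyPower(float* data, size_t n, float power, float scale, float shift)
    {
        if (power == 1.f)
            apply(AffineOp{scale, shift}, data, n);           // no pow(), no per-element branch
        else
            apply(PowAffineOp{power, scale, shift}, data, n);
    }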
@@ -98,15 +98,14 @@ public:
     void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
     {
+        Mat& output = outputs[0];
         switch (op)
         {
         case SUM:
-            {
             CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
-            Mat& output = outputs[0];
-            output.setTo(0.);
             if (0 < coeffs.size())
             {
+                output.setTo(0.);
                 for (size_t i = 0; i < inputs.size(); i++)
                 {
                     output += *inputs[i] * coeffs[i];
@@ -114,32 +113,26 @@ public:
                 }
             }
             else
             {
-                for (size_t i = 0; i < inputs.size(); i++)
+                add(*inputs[0], *inputs[1], output);
+                for (size_t i = 2; i < inputs.size(); i++)
                 {
                     output += *inputs[i];
                 }
             }
-            }
             break;
         case PROD:
-            {
-            Mat& output = outputs[0];
             output.setTo(1.);
             for (size_t i = 0; i < inputs.size(); i++)
             {
                 output = output.mul(*inputs[i]);
             }
-            }
             break;
         case MAX:
-            {
-            Mat& output = outputs[0];
             cv::max(*inputs[0], *inputs[1], output);
             for (size_t i = 2; i < inputs.size(); i++)
             {
                 cv::max(output, *inputs[i], output);
             }
-            }
             break;
         default:
             CV_Assert(0);
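Note: two independent cleanups here. Mat& output is hoisted above the switch, removing the per-case braces and duplicate declarations, and the coefficient-free SUM no longer zero-fills the output before accumulating: add(*inputs[0], *inputs[1], output) seeds the accumulator directly, so setTo(0.) plus the first two += collapse into a single pass over the blob. The same pattern on plain arrays, as a sketch (the layer already indexes inputs[0] and inputs[1] unconditionally, so at least two inputs are assumed):

    #include <vector>
    #include <cstddef>

    // Element-wise sum: seed the accumulator from the first two inputs
    // instead of zero-filling it first.
    void sumInputs(const std::vector<const float*>& in, float* out, size_t n)
    {
        for (size_t j = 0; j < n; j++)
            out[j] = in[0][j] + in[1][j];      // replaces memset + two separate adds
        for (size_t i = 2; i < in.size(); i++)
            for (size_t j = 0; j < n; j++)
                out[j] += in[i][j];
    }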
@@ -49,264 +49,15 @@ namespace cv
 namespace dnn
 {

-template <typename Dtype>
-class im2col_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_im;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    int dilation_h, dilation_w;
-    Dtype* data_col;
-    int height_col, width_col, channels_col;
-
-    im2col_CpuPBody() {}
-public:
-    static void run(const Dtype* data_im,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    int dilation_h, int dilation_w,
-                    int height_col, int width_col,
-                    Dtype* data_col)
-    {
-        im2col_CpuPBody<Dtype> t;
-        t.data_im = data_im;
-        t.data_col = data_col;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
-        t.height_col = height_col;
-        t.width_col = width_col;
-        t.channels_col = channels * kernel_h * kernel_w;
-
-        cv::parallel_for_(Range(0, t.channels_col), t);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        for (int c = r.start; c < r.end; ++c)
-        {
-            int w_offset = c % kernel_w;
-            int h_offset = (c / kernel_w) % kernel_h;
-            int c_im = c / kernel_h / kernel_w;
-            for (int h = 0; h < height_col; ++h)
-            {
-                for (int w = 0; w < width_col; ++w)
-                {
-                    int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
-                    int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
-                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                        data_col[(c * height_col + h) * width_col + w] =
-                            data_im[(c_im * height + h_pad) * width + w_pad];
-                    else
-                        data_col[(c * height_col + h) * width_col + w] = 0;
-                }
-            }
-        }
-    }
-};
-
-template <typename Dtype>
-class im2row_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_im;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    int dilation_h, dilation_w;
-    Dtype* data_col;
-    int height_col, width_col, channels_col;
-
-    im2row_CpuPBody() {}
-public:
-    static void run(const Dtype* data_im,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    int dilation_h, int dilation_w,
-                    int height_col, int width_col,
-                    Dtype* data_col)
-    {
-        im2row_CpuPBody<Dtype> t;
-        t.data_im = data_im;
-        t.data_col = data_col;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.dilation_h = dilation_h; t.dilation_w = dilation_w;
-        t.height_col = height_col;
-        t.width_col = width_col;
-        t.channels_col = channels * kernel_h * kernel_w;
-
-        cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        int dh = dilation_h, dw = dilation_w;
-        Dtype* data_col_ = data_col;
-        const Dtype* data_im_ = data_im;
-        for (int row = r.start; row < r.end; ++row)
-        {
-            int out_c = row % width_col;
-            int out_r = row / width_col;
-            int out_row_offset = row*kernel_h*kernel_w*channels;
-            int start_in_r = out_r * stride_h - pad_h;
-            int start_in_c = out_c * stride_w - pad_w;
-            int start_k_r = std::max(0, cvCeil(-start_in_r/(float)dilation_h));
-            int end_k_r = std::min(kernel_h, cvCeil((height - start_in_r)/(float)dilation_h));
-            int start_k_c = std::max(0, cvCeil(-start_in_c/(float)dilation_w));
-            int end_k_c = std::min(kernel_w, cvCeil((width - start_in_c)/(float)dilation_w));
-
-            for(int i_c = 0; i_c < channels; i_c++)
-            {
-                int channels_offset = i_c * width * height;
-                int out_ch_offset = i_c*kernel_h*kernel_w;
-                int in_r = start_in_r + start_k_r*dilation_h;
-
-                for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
-                {
-                    int row_offset = in_r*width;
-                    int out_col_offset = k_r*kernel_w;
-                    int in_c = start_in_c + start_k_c*dilation_w;
-
-                    for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
-                    {
-                        int in_index = channels_offset + row_offset + in_c;
-                        int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
-                        data_col_[out_index] = data_im_[in_index];
-                    }
-                }
-            }
-        }
-    }
-};
-
-template <typename Dtype>
-class col2im_CpuPBody : public cv::ParallelLoopBody
-{
-    const Dtype* data_col;
-    int channels, height, width;
-    int kernel_h, kernel_w;
-    int pad_h, pad_w;
-    int stride_h, stride_w;
-    Dtype* data_im;
-    int height_col, width_col;
-
-    col2im_CpuPBody() {}
-public:
-    static void run(const Dtype* data_col,
-                    int channels, int height, int width,
-                    int kernel_h, int kernel_w,
-                    int pad_h, int pad_w,
-                    int stride_h, int stride_w,
-                    Dtype* data_im)
-    {
-        //TODO: single-threaded version switch
-        col2im_CpuPBody t;
-        t.data_col = data_col;
-        t.data_im = data_im;
-        t.channels = channels; t.height = height; t.width = width;
-        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
-        t.pad_h = pad_h; t.pad_w = pad_w;
-        t.stride_h = stride_h; t.stride_w = stride_w;
-        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-
-        int img_total = channels * height * width;
-        cv::parallel_for_(Range(0, img_total), t);
-    }
-
-    virtual void operator ()(const Range &r) const
-    {
-        const Dtype* data_col_ = data_col;
-        Dtype* data_im_ = data_im;
-        int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
-        int coeff_w_col = (1 - stride_w * height_col * width_col);
-        for (int index = r.start; index < r.end; index++)
-        {
-            Dtype val = 0;
-            int w = index % width + pad_w;
-            int h = (index / width) % height + pad_h;
-            int c = index / (width * height);
-
-            // compute the start and end of the output
-            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-            int w_col_end = std::min(w / stride_w + 1, width_col);
-            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-            int h_col_end = std::min(h / stride_h + 1, height_col);
-
-            // equivalent implementation
-            int offset =
-                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
-            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-                    val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-                }
-            }
-            data_im_[index] = val;
-        }
-    }
-};
-
-//single-threaded version
-template <typename Dtype>
-void col2im_cpu(const Dtype* data_col,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                int dilation_h, int dilation_w,
-                Dtype* data_im)
-{
-    int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-
-    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
-
-    for (int c = 0; c < channels_col; ++c)
-    {
-        int w_offset = c % kernel_w;
-        int h_offset = (c / kernel_w) % kernel_h;
-        int c_im = c / kernel_h / kernel_w;
-
-        for (int h = 0; h < height_col; ++h)
-        {
-            for (int w = 0; w < width_col; ++w)
-            {
-                int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
-                int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
-                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                    data_im[(c_im * height + h_pad) * width + w_pad] +=
-                        data_col[(c * height_col + h) * width_col + w];
-            }
-        }
-    }
-}
+void im2row(const float* data_im, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            int height_col, int width_col, float* data_col);
+
+void col2im(const float* data_col, int channels, int height, int width,
+            int kernel_h, int kernel_w, int pad_h, int pad_w,
+            int stride_h, int stride_w, int dilation_h, int dilation_w,
+            float* data_im, const int* ofsbuf);
 }
 }
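Note: the header shrinks from three templated ParallelLoopBody implementations to two plain float declarations, with the definitions moved out of the header. im2row writes one row per output location containing that location's kernel_h*kernel_w*channels patch, so the convolution itself reduces to a single matrix multiply; col2im scatters columns back to the image for the deconvolution/backward path, with ofsbuf presumably a precomputed offset table (its contents are defined where the function is implemented). The output geometry both functions assume is the standard convolution arithmetic that appeared verbatim in the deleted col2im_cpu:

    // Spatial output size with padding, stride and dilation:
    int height_col = (height + 2*pad_h - (dilation_h*(kernel_h - 1) + 1)) / stride_h + 1;
    int width_col  = (width  + 2*pad_w - (dilation_w*(kernel_w - 1) + 1)) / stride_w + 1;
    // im2row then emits height_col*width_col rows of kernel_h*kernel_w*channels
    // floats each; multiplying by the weight matrix yields all outputs at once.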