Commit 9e26b24d authored by Vadim Pisarevsky

improved speed of ENet processing.

parent 3f5b4655
......@@ -98,14 +98,19 @@ int main(int argc, char **argv)
net.setBlob("", inputBlob); //set the network input
//! [Set input blob]
const int N = 3;
TickMeter tm;
tm.start();
//! [Make forward pass]
net.forward(); //compute output
//! [Make forward pass]
tm.stop();
for( int i = 0; i < N; i++ )
{
TickMeter tm_;
tm_.start();
net.forward(); //compute output
tm_.stop();
if( i == 0 || tm_.getTimeTicks() < tm.getTimeTicks() )
tm = tm_;
}
//! [Gather output]
......
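The sample change above times the forward pass with a best-of-N scheme: run the network N times and keep the fastest measurement, so one-time allocations and cache warm-up on the first run do not distort the reported speed. Below is a minimal, self-contained sketch of the same pattern with cv::TickMeter; the placeholder workload stands in for net.forward() and is not part of the original sample.

#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>
#include <iostream>

int main()
{
    const int N = 3;        // number of timed runs
    cv::TickMeter best;     // keeps the fastest of the N measurements
    cv::Mat m(1000, 1000, CV_32F, cv::Scalar(2)), r;
    for (int i = 0; i < N; i++)
    {
        cv::TickMeter tm;
        tm.start();
        cv::sqrt(m, r);     // placeholder workload; the sample times net.forward()
        tm.stop();
        // keep the minimum: the first run may include one-time allocations
        if (i == 0 || tm.getTimeTicks() < best.getTimeTicks())
            best = tm;
    }
    std::cout << "best time: " << best.getTimeMilli() << " ms" << std::endl;
    return 0;
}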
......@@ -41,6 +41,15 @@ public:
Mat* inp = inputs[i];
outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
}
varMeanScale = 1.f;
if (!hasWeights && !hasBias) {
varMeanScale = *blobs[2].ptr<float>();
if (varMeanScale != 0)
varMeanScale = 1/varMeanScale;
}
cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
......@@ -52,16 +61,6 @@ public:
int weightsBlobIndex = 2;
int biasBlobIndex = weightsBlobIndex + hasWeights;
float varMeanScale = 1;
if (!hasWeights && !hasBias) {
varMeanScale = *blobs[2].ptr<float>();
if (varMeanScale != 0)
varMeanScale = 1/varMeanScale;
}
Mat invStdMat;
cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
int rows = inpBlob.size[2];
int cols = inpBlob.size[3];
......@@ -92,7 +91,8 @@ public:
}
bool hasWeights, hasBias;
float epsilon;
float epsilon, varMeanScale;
Mat invStdMat;
};
Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
......
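The batch-norm change moves the per-channel computation of 1/sqrt(var*scale + eps) (invStdMat) from forward() into allocate(), so it is evaluated once per network setup instead of on every inference. The following is a simplified, illustrative sketch of that idea; SimpleBatchNorm and its members are invented names for the example, not the actual cv::dnn classes.

#include <opencv2/core.hpp>

struct SimpleBatchNorm               // hypothetical class, for illustration only
{
    cv::Mat mean, var, invStd;       // per-channel statistics, CV_32F
    float epsilon;

    // called once, when input shapes are known
    void allocate()
    {
        // invStd = (var + eps)^(-1/2), computed once and cached
        cv::pow(var + epsilon, -0.5, invStd);
    }

    // called for every input blob; only cheap per-element work remains
    void forwardChannel(const cv::Mat& src, cv::Mat& dst, int c) const
    {
        float m = mean.at<float>(c);
        float s = invStd.at<float>(c);
        dst = (src - m) * s;         // normalize using the cached scale
    }
};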
......@@ -15,8 +15,7 @@ using std::pow;
template<typename Func>
class ElementWiseLayer : public Func::Layer
{
Func func;
public:
template<typename Dtype>
class PBody : public cv::ParallelLoopBody
{
......@@ -35,9 +34,7 @@ class ElementWiseLayer : public Func::Layer
}
};
public:
ElementWiseLayer(const Func &f=Func()) : func(f) {}
ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}
void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
......@@ -58,9 +55,16 @@ public:
Range sizeRange = Range(0, dst.total());
CV_Assert(src.type() == CV_32F);
cv::parallel_for_(sizeRange, PBody<float>(dst, func));
PBody<float> body(dst, func);
if( run_parallel )
cv::parallel_for_(sizeRange, body);
else
body(sizeRange);
}
}
Func func;
bool run_parallel;
};
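ElementWiseLayer gains a run_parallel flag: cheap functors skip cv::parallel_for_ and its scheduling overhead by invoking the loop body directly, while expensive functors (tanh, sigmoid, BNLL, pow) still fan out across threads. A minimal sketch of that dispatch, assuming a continuous CV_32F input; TanhBody and applyTanh are illustrative names, not part of the commit.

#include <opencv2/core.hpp>
#include <cmath>

struct TanhBody : public cv::ParallelLoopBody
{
    const float* src; float* dst;
    TanhBody(const float* s, float* d) : src(s), dst(d) {}
    void operator()(const cv::Range& r) const
    {
        for (int i = r.start; i < r.end; i++)
            dst[i] = std::tanh(src[i]);
    }
};

static void applyTanh(const cv::Mat& src, cv::Mat& dst, bool run_parallel)
{
    CV_Assert(src.type() == CV_32F && src.isContinuous());
    dst.create(src.dims, src.size.p, src.type());
    cv::Range full(0, (int)src.total());
    TanhBody body(src.ptr<float>(), dst.ptr<float>());
    if (run_parallel)
        cv::parallel_for_(full, body);   // heavy functor: spread across threads
    else
        body(full);                      // cheap functor: stay single-threaded
}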
struct ReLUFunctor
......@@ -135,8 +139,24 @@ struct PowerFunctor
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return power == 1.0f ? (TFloat)shift + (TFloat)scale * x :
pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
}
};
struct PowerFunctor1
{
typedef PowerLayer Layer;
const float scale;
const float shift;
PowerFunctor1(float scale_ = 1.f, float shift_ = 0)
: scale(scale_), shift(shift_) {}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return (TFloat)shift + (TFloat)scale * x;
}
};
......@@ -165,12 +185,12 @@ public:
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
CV_Assert(inputs.size() == 1);
Mat &inpBlob = *inputs[0];
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Mat &outBlob = outputs[ii];
CV_Assert(inpBlob.isContinuous() && outBlob.isContinuous());
CV_Assert(blobs[0].total() == inpBlob.size[1]);
......@@ -181,8 +201,16 @@ public:
Mat inpBlobPlane = getPlane(inpBlob, 0, n);
Mat outBlobPlane = getPlane(outBlob, 0, n);
threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
size_t i, planeTotal = inpBlobPlane.total();
const float* inptr = inpBlobPlane.ptr<float>();
float* outptr = outBlobPlane.ptr<float>();
for( i = 0; i < planeTotal; i++ )
{
float val = inptr[i];
outptr[i] = val*(val >= 0.f ? 1.f : slopeWeight);
}
//threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV);
//scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane);
}
}
}
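The channel-wise PReLU forward previously needed two whole-plane OpenCV calls (threshold followed by scaleAdd), i.e. two full passes over the data per channel; the change fuses them into a single scalar loop. A stand-alone sketch of the fused form is below; preluPlane is an invented helper name, and a continuous CV_32F plane is assumed.

#include <opencv2/core.hpp>

static void preluPlane(const cv::Mat& inPlane, cv::Mat& outPlane, float slope)
{
    CV_Assert(inPlane.type() == CV_32F && inPlane.isContinuous());
    outPlane.create(inPlane.dims, inPlane.size.p, CV_32F);
    const float* inptr = inPlane.ptr<float>();
    float* outptr = outPlane.ptr<float>();
    size_t total = inPlane.total();
    for (size_t i = 0; i < total; i++)
    {
        float val = inptr[i];
        // positives pass through, negatives are scaled by the per-channel slope
        outptr[i] = val * (val >= 0.f ? 1.f : slope);
    }
}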
......@@ -196,7 +224,7 @@ Ptr<_Layer> _Layer::create() { \
Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
{
float negativeSlope = params.get<float>("negative_slope", 0.f);
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(false, ReLUFunctor(negativeSlope)));
l->setParamsFrom(params);
return l;
......@@ -204,7 +232,7 @@ Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
{
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -212,7 +240,7 @@ Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
{
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -228,7 +256,7 @@ Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
{
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>(true));
l->setParamsFrom(params);
return l;
......@@ -239,7 +267,9 @@ Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
float power = params.get<float>("power", 1.0f);
float scale = params.get<float>("scale", 1.0f);
float shift = params.get<float>("shift", 0.0f);
Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
Ptr<PowerLayer> l(power == 1.f ?
(PowerLayer*)(new ElementWiseLayer<PowerFunctor1>(false, PowerFunctor1(scale, shift))) :
(PowerLayer*)(new ElementWiseLayer<PowerFunctor>(true, PowerFunctor(power, scale, shift))));
l->setParamsFrom(params);
return l;
......
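For power == 1 the power layer collapses to the affine form shift + scale*x, so PowerLayer::create now picks the cheap PowerFunctor1 (run sequentially), and reserves the per-element pow() call plus parallel execution for the general case. A small hedged sketch of the same choose-the-cheaper-functor pattern outside cv::dnn; AffineOp, PowOp, and runPowerLayer are invented names for the example.

#include <cmath>
#include <vector>

struct AffineOp                      // the power == 1 fast path: shift + scale*x
{
    float scale, shift;
    float operator()(float x) const { return shift + scale * x; }
};

struct PowOp                         // the general case: pow(shift + scale*x, power)
{
    float power, scale, shift;
    float operator()(float x) const { return std::pow(shift + scale * x, power); }
};

template <typename Op>
static void apply(const Op& op, std::vector<float>& v)
{
    for (size_t i = 0; i < v.size(); i++)
        v[i] = op(v[i]);
}

static void runPowerLayer(std::vector<float>& v, float power, float scale, float shift)
{
    if (power == 1.f)
        apply(AffineOp{scale, shift}, v);        // no per-element pow() call
    else
        apply(PowOp{power, scale, shift}, v);    // general formula
}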
......@@ -98,15 +98,14 @@ public:
void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
{
Mat& output = outputs[0];
switch (op)
{
case SUM:
{
CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
Mat& output = outputs[0];
output.setTo(0.);
if (0 < coeffs.size())
{
output.setTo(0.);
for (size_t i = 0; i < inputs.size(); i++)
{
output += *inputs[i] * coeffs[i];
......@@ -114,32 +113,26 @@ public:
}
else
{
for (size_t i = 0; i < inputs.size(); i++)
add(*inputs[0], *inputs[1], output);
for (size_t i = 2; i < inputs.size(); i++)
{
output += *inputs[i];
}
}
}
break;
case PROD:
{
Mat& output = outputs[0];
output.setTo(1.);
for (size_t i = 0; i < inputs.size(); i++)
{
output = output.mul(*inputs[i]);
}
}
break;
case MAX:
{
Mat& output = outputs[0];
cv::max(*inputs[0], *inputs[1], output);
for (size_t i = 2; i < inputs.size(); i++)
{
cv::max(output, *inputs[i], output);
}
}
break;
default:
CV_Assert(0);
......
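In the SUM branch without coefficients, the output no longer has to be zero-filled before accumulation: the first two inputs are added straight into the output and the remaining ones accumulated on top, saving one full write pass over the blob. A hedged sketch of that accumulation, assuming at least two equally sized CV_32F inputs; eltwiseSum is an illustrative helper name.

#include <opencv2/core.hpp>
#include <vector>

static void eltwiseSum(const std::vector<cv::Mat>& inputs, cv::Mat& output)
{
    CV_Assert(inputs.size() >= 2);
    // write the sum of the first two inputs directly into output;
    // no output.setTo(0) pass is needed
    cv::add(inputs[0], inputs[1], output);
    for (size_t i = 2; i < inputs.size(); i++)
        output += inputs[i];
}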
......@@ -49,264 +49,15 @@ namespace cv
namespace dnn
{
template <typename Dtype>
class im2col_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2col_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.channels_col), t);
}
virtual void operator ()(const Range &r) const
{
for (int c = r.start; c < r.end; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
}
}
}
}
};
template <typename Dtype>
class im2row_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
int dilation_h, dilation_w;
Dtype* data_col;
int height_col, width_col, channels_col;
im2row_CpuPBody() {}
public:
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int height_col, int width_col,
Dtype* data_col)
{
im2row_CpuPBody<Dtype> t;
t.data_im = data_im;
t.data_col = data_col;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.dilation_h = dilation_h; t.dilation_w = dilation_w;
t.height_col = height_col;
t.width_col = width_col;
t.channels_col = channels * kernel_h * kernel_w;
cv::parallel_for_(Range(0, t.height_col*t.width_col), t, 16);
}
virtual void operator ()(const Range &r) const
{
int dh = dilation_h, dw = dilation_w;
Dtype* data_col_ = data_col;
const Dtype* data_im_ = data_im;
for (int row = r.start; row < r.end; ++row)
{
int out_c = row % width_col;
int out_r = row / width_col;
int out_row_offset = row*kernel_h*kernel_w*channels;
int start_in_r = out_r * stride_h - pad_h;
int start_in_c = out_c * stride_w - pad_w;
int start_k_r = std::max(0, cvCeil(-start_in_r/(float)dilation_h));
int end_k_r = std::min(kernel_h, cvCeil((height - start_in_r)/(float)dilation_h));
int start_k_c = std::max(0, cvCeil(-start_in_c/(float)dilation_w));
int end_k_c = std::min(kernel_w, cvCeil((width - start_in_c)/(float)dilation_w));
for(int i_c = 0; i_c < channels; i_c++)
{
int channels_offset = i_c * width * height;
int out_ch_offset = i_c*kernel_h*kernel_w;
int in_r = start_in_r + start_k_r*dilation_h;
for(int k_r = start_k_r; k_r < end_k_r; k_r++, in_r += dh)
{
int row_offset = in_r*width;
int out_col_offset = k_r*kernel_w;
int in_c = start_in_c + start_k_c*dilation_w;
for(int k_c = start_k_c; k_c < end_k_c; k_c++, in_c += dw)
{
int in_index = channels_offset + row_offset + in_c;
int out_index = out_row_offset + out_ch_offset + out_col_offset + k_c;
data_col_[out_index] = data_im_[in_index];
}
}
}
}
}
};
template <typename Dtype>
class col2im_CpuPBody : public cv::ParallelLoopBody
{
const Dtype* data_col;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_im;
int height_col, width_col;
col2im_CpuPBody() {}
public:
static void run(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_im)
{
//TODO: single-threaded version switch
col2im_CpuPBody t;
t.data_col = data_col;
t.data_im = data_im;
t.channels = channels; t.height = height; t.width = width;
t.kernel_h = kernel_h; t.kernel_w = kernel_w;
t.pad_h = pad_h; t.pad_w = pad_w;
t.stride_h = stride_h; t.stride_w = stride_w;
t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int img_total = channels * height * width;
cv::parallel_for_(Range(0, img_total), t);
}
virtual void operator ()(const Range &r) const
{
const Dtype* data_col_ = data_col;
Dtype* data_im_ = data_im;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int index = r.start; index < r.end; index++)
{
Dtype val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
int w_col_end = std::min(w / stride_w + 1, width_col);
int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
int h_col_end = std::min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col_[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im_[index] = val;
}
}
};
//single-threaded version
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
Dtype* data_im)
{
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
for (int c = 0; c < channels_col; ++c)
{
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h)
{
for (int w = 0; w < width_col; ++w)
{
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
}
}
}
}
void im2row(const float* data_im, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
int height_col, int width_col, float* data_col);
void col2im(const float* data_col, int channels, int height, int width,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w, int dilation_h, int dilation_w,
float* data_im, const int* ofsbuf);
}
}
......
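The per-column im2col/col2im loop bodies are removed from the header in favor of the im2row/col2im functions declared above (implemented elsewhere in the module): im2row lays out one output row per spatial position, with the whole channels x kernel_h x kernel_w receptive field stored contiguously, so the convolution becomes a single GEMM against the filter matrix. Below is a hedged, single-threaded sketch of that layout; im2row_sketch is an invented name, dilation is omitted for brevity, and padded positions are written as zeros.

#include <cstring>

static void im2row_sketch(const float* data_im, int channels, int height, int width,
                          int kernel_h, int kernel_w, int pad_h, int pad_w,
                          int stride_h, int stride_w,
                          int height_col, int width_col, float* data_col)
{
    int row_len = channels * kernel_h * kernel_w;      // one receptive field per row
    std::memset(data_col, 0, (size_t)height_col * width_col * row_len * sizeof(float));
    for (int out_r = 0; out_r < height_col; out_r++)
        for (int out_c = 0; out_c < width_col; out_c++)
        {
            float* row = data_col + (out_r * width_col + out_c) * row_len;
            int in_r0 = out_r * stride_h - pad_h;
            int in_c0 = out_c * stride_w - pad_w;
            for (int c = 0; c < channels; c++)
                for (int kr = 0; kr < kernel_h; kr++)
                    for (int kc = 0; kc < kernel_w; kc++)
                    {
                        int in_r = in_r0 + kr, in_c = in_c0 + kc;
                        if (in_r >= 0 && in_r < height && in_c >= 0 && in_c < width)
                            row[(c * kernel_h + kr) * kernel_w + kc] =
                                data_im[(c * height + in_r) * width + in_c];
                    }
        }
}

With this layout, multiplying the (height_col*width_col) x row_len matrix by the transposed num_output x row_len weight matrix yields all output channels for the image in one GEMM call.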