Commit 4cb108ef authored by Vitaliy Lyudvichenko

Adding CPU parallelization for element-wise layers and im2col operation

parent 6d3cb808
......@@ -179,9 +179,9 @@ namespace dnn
#endif // HAVE_OPENCL
if (inpBlob.type() == CV_32F)
im2col_cpu((float *)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
im2col_CpuPBody<float>::run((float*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
if (inpBlob.type() == CV_64F)
im2col_cpu((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
im2col_CpuPBody<double>::run((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
}
void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
......@@ -253,9 +253,9 @@ namespace dnn
if (is1x1()) return;
if (dstMat.type() == CV_32F)
col2im_cpu((float*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float*)dstMat.ptr());
col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
if (dstMat.type() == CV_64F)
col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr());
col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
}
}
}
......@@ -55,130 +55,147 @@ using std::exp;
using std::tanh;
using std::pow;
template<typename Func>
class ElementWiseLayer : public Layer
template<typename Func>
class ElementWiseLayer : public Layer
{
Func func;
template<typename Dtype>
class PBody : public cv::ParallelLoopBody
{
Func func;
Dtype *data;
Func &func;
public:
ElementWiseLayer(LayerParams &_params) : func(_params) {}
/**
 * Prepares a parallel loop body for one blob.
 *
 * @param blob   blob whose elements are processed; its typed data pointer
 *               (blob.ptr<Dtype>()) is cached here.
 * @param func_  functor applied to every element; it is bound to the `func`
 *               member (declared as a reference), so it must stay alive
 *               while this body is in use.
 */
PBody(Blob &blob, Func &func_)
    : data(blob.ptr<Dtype>()),
      func(func_)
{
}
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
void operator()(const Range &r) const
{
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
outputs[i].shareFrom(*inputs[i]); //no data copy
}
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
size_t size = outputs[i].total();
if (outputs[i].type() == CV_32F)
{
float *data = outputs[i].ptrf();
for (size_t j = 0; j < size; j++)
data[j] = func(data[j]);
}
else if (outputs[i].type() == CV_64F)
{
double *data = outputs[i].ptr<double>();
for (size_t j = 0; j < size; j++)
data[j] = func(data[j]);
}
else
{
CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported");
}
}
for (int i = r.start; i < r.end; i++)
data[i] = func(data[i]);
}
};
public:
struct ReLUFunctor
ElementWiseLayer(LayerParams &_params) : func(_params) {}
void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
float negative_slope;
outputs.resize(inputs.size());
for (size_t i = 0; i < inputs.size(); i++)
outputs[i].shareFrom(*inputs[i]); //no data copy
}
ReLUFunctor(LayerParams &params)
void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
for (size_t i = 0; i < inputs.size(); i++)
{
if (params.has("negative_slope"))
negative_slope = params.get<float>("negative_slope");
CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
CV_Assert(inputs[i]->matRefConst().isContinuous());
Range sizeRange = Range(0, outputs[i].total());
if (outputs[i].type() == CV_32F)
{
cv::parallel_for_(sizeRange, PBody<float>(outputs[i], func));
}
else if (outputs[i].type() == CV_64F)
{
cv::parallel_for_(sizeRange, PBody<double>(outputs[i], func));
}
else
negative_slope = 0.f;
{
CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported");
}
}
}
};
template<typename TFloat>
inline TFloat operator()(TFloat x)
{
return (x >= (TFloat)0) ? x : negative_slope * x;
}
};
struct TanHFunctor
struct ReLUFunctor
{
float negative_slope;
ReLUFunctor(LayerParams &params)
{
TanHFunctor(LayerParams&) {}
if (params.has("negative_slope"))
negative_slope = params.get<float>("negative_slope");
else
negative_slope = 0.f;
}
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
return (x >= (TFloat)0) ? x : negative_slope * x;
}
};
template<typename TFloat>
inline TFloat operator()(TFloat x)
{
return tanh(x);
}
};
struct TanHFunctor
{
TanHFunctor(LayerParams&) {}
struct SigmoidFunctor
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
SigmoidFunctor(LayerParams&) {}
return tanh(x);
}
};
template<typename TFloat>
inline TFloat operator()(TFloat x)
{
return (TFloat)1 / ((TFloat)1 + exp(-x));
}
};
struct SigmoidFunctor
{
SigmoidFunctor(LayerParams&) {}
struct AbsValFunctor
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
AbsValFunctor(LayerParams&) {}
return (TFloat)1 / ((TFloat)1 + exp(-x));
}
};
template<typename TFloat>
inline TFloat operator()(TFloat x)
{
return abs(x);
}
};
struct AbsValFunctor
{
AbsValFunctor(LayerParams&) {}
struct PowerFunctor
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
float power, scale, shift;
return abs(x);
}
};
PowerFunctor(LayerParams &params)
{
power = params.get<float>("power", 1.0f);
scale = params.get<float>("scale", 1.0f);
shift = params.get<float>("shift", 0.0f);
}
struct PowerFunctor
{
float power, scale, shift;
template<typename TFloat>
inline TFloat operator()(TFloat x)
{
return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
}
};
PowerFunctor(LayerParams &params)
{
power = params.get<float>("power", 1.0f);
scale = params.get<float>("scale", 1.0f);
shift = params.get<float>("shift", 0.0f);
}
struct BNLLFunctor
template<typename TFloat>
inline TFloat operator()(TFloat x) const
{
BNLLFunctor(LayerParams&) {}
return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
}
};
/**
 * Element-wise BNLL (binomial normal log likelihood) activation:
 *     f(x) = log(1 + exp(x))
 *
 * The constructor takes LayerParams only to match the interface of the other
 * element-wise functors; no parameters are read.
 */
struct BNLLFunctor
{
    BNLLFunctor(LayerParams&) {}

    /**
     * @param x  input value (floating-point type TFloat).
     * @return   log(1 + exp(x)), evaluated in a form that does not overflow
     *           for large positive x.
     */
    template<typename TFloat>
    inline TFloat operator()(TFloat x) const
    {
        // log(1 + exp(x)) == max(x, 0) + log(1 + exp(-|x|)).
        // The previous code returned only the second term, which is wrong for
        // every x > 0 (e.g. it gave ~0 instead of ~x for large x).
        const TFloat positivePart = (x > (TFloat)0) ? x : (TFloat)0;
        return positivePart + log((TFloat)1 + exp(-abs(x)));
    }
};
template<typename TFloat>
inline TFloat operator()(TFloat x)
{
return log((TFloat)1 + exp(-abs(x)));
}
};
}
}
#endif
......@@ -41,6 +41,8 @@
#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
#define __OPENCV_DNN_LAYERS_IM2COL_HPP__
#include <opencv2/core.hpp>
#include <iostream>
namespace cv
{
......@@ -48,33 +50,67 @@ namespace dnn
{
template <typename Dtype>
void im2col_cpu(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_col)
class im2col_CpuPBody : public cv::ParallelLoopBody
{
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % kernel_w;
int h_offset = (c / kernel_w) % kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h) {
for (int w = 0; w < width_col; ++w) {
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
else
data_col[(c * height_col + h) * width_col + w] = 0;
const Dtype* data_im;
int channels, height, width;
int kernel_h, kernel_w;
int pad_h, pad_w;
int stride_h, stride_w;
Dtype* data_col;
int height_col, width_col, channels_col;
public:
/**
 * Stores the im2col arguments and derives the column-matrix dimensions.
 *
 * @param data_im_   source image data.
 * @param channels_  number of image channels.
 * @param height_    image height.
 * @param width_     image width.
 * @param kernel_h_  kernel height.
 * @param kernel_w_  kernel width.
 * @param pad_h_     vertical padding.
 * @param pad_w_     horizontal padding.
 * @param stride_h_  vertical stride.
 * @param stride_w_  horizontal stride.
 * @param data_col_  destination column buffer.
 *
 * Derived values:
 *   height_col   = (height + 2 * pad_h - kernel_h) / stride_h + 1
 *   width_col    = (width  + 2 * pad_w - kernel_w) / stride_w + 1
 *   channels_col = channels * kernel_h * kernel_w
 */
im2col_CpuPBody(const Dtype* data_im_,
                int channels_, int height_, int width_,
                int kernel_h_, int kernel_w_,
                int pad_h_, int pad_w_,
                int stride_h_, int stride_w_,
                Dtype* data_col_)
    : data_im(data_im_),
      channels(channels_),
      height(height_),
      width(width_),
      kernel_h(kernel_h_),
      kernel_w(kernel_w_),
      pad_h(pad_h_),
      pad_w(pad_w_),
      stride_h(stride_h_),
      stride_w(stride_w_),
      data_col(data_col_),
      height_col((height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1),
      width_col((width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1),
      channels_col(channels_ * kernel_h_ * kernel_w_)
{
}
static void run(const Dtype* data_im,
int channels, int height, int width,
int kernel_h, int kernel_w,
int pad_h, int pad_w,
int stride_h, int stride_w,
Dtype* data_col)
{
im2col_CpuPBody<Dtype> pb(data_im, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_col);
cv::parallel_for_(Range(0, pb.channels_col), pb);
}
/**
 * Fills rows [r.start, r.end) of the column matrix.
 *
 * Each row index encodes (image channel, kernel row offset, kernel column
 * offset). For every output position the matching source pixel is copied;
 * positions that fall outside the image (i.e. into the padding) are set to 0.
 */
virtual void operator ()(const Range &r) const
{
    for (int row = r.start; row < r.end; ++row)
    {
        // Decompose the row index into kernel offsets and the source channel.
        const int w_offset = row % kernel_w;
        const int h_offset = (row / kernel_w) % kernel_h;
        const int c_im = row / kernel_h / kernel_w;

        for (int h = 0; h < height_col; ++h)
        {
            const int h_pad = h * stride_h - pad_h + h_offset;
            const bool rowInside = (h_pad >= 0) && (h_pad < height);

            for (int w = 0; w < width_col; ++w)
            {
                const int w_pad = w * stride_w - pad_w + w_offset;
                const int dstIdx = (row * height_col + h) * width_col + w;

                if (rowInside && w_pad >= 0 && w_pad < width)
                {
                    const int srcIdx = (c_im * height + h_pad) * width + w_pad;
                    data_col[dstIdx] = data_im[srcIdx];
                }
                else
                {
                    data_col[dstIdx] = 0;
                }
            }
        }
    }
}
}
};
template <typename Dtype>
void col2im_cpu(const Dtype* data_col,
......
......@@ -65,6 +65,8 @@ static void testLayer(String basename, bool useCaffeModel = false, bool useCommo
String inpfile = (useCommonInputBlob) ? _tf("blob.npy") : _tf(basename + ".input.npy");
String outfile = _tf(basename + ".npy");
cv::setNumThreads(cv::getNumberOfCPUs());
Net net;
{
Ptr<Importer> importer = createCaffeImporter(prototxt, (useCaffeModel) ? caffemodel : String());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment