Commit 4cb108ef authored by Vitaliy Lyudvichenko

Adding CPU parallelization for element-wise layers and im2col operation

parent 6d3cb808
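All three hunks below apply the same cv::parallel_for_ / cv::ParallelLoopBody pattern: the per-element (or per-channel) work is wrapped in a ParallelLoopBody subclass, and parallel_for_ splits the index range across OpenCV's thread pool. A minimal self-contained sketch of that pattern, with illustrative names (ScaleBody and its scale-by-2 kernel are not part of this commit):

    #include <opencv2/core/utility.hpp>
    #include <vector>

    // Illustrative body: doubles every element of a float buffer in parallel.
    struct ScaleBody : public cv::ParallelLoopBody
    {
        float *data;
        explicit ScaleBody(float *data_) : data(data_) {}

        // Each worker thread receives a disjoint sub-range [r.start, r.end).
        void operator()(const cv::Range &r) const
        {
            for (int i = r.start; i < r.end; i++)
                data[i] *= 2.f;
        }
    };

    int main()
    {
        std::vector<float> v(1 << 20, 1.f);
        // parallel_for_ partitions Range(0, n) across the available worker threads.
        cv::parallel_for_(cv::Range(0, (int)v.size()), ScaleBody(v.data()));
        return 0;
    }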
@@ -179,9 +179,9 @@ namespace dnn
 #endif // HAVE_OPENCL

         if (inpBlob.type() == CV_32F)
-            im2col_cpu((float *)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
+            im2col_CpuPBody<float>::run((float*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
         if (inpBlob.type() == CV_64F)
-            im2col_cpu((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
+            im2col_CpuPBody<double>::run((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
     }

     void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
@@ -253,9 +253,9 @@ namespace dnn
         if (is1x1()) return;

         if (dstMat.type() == CV_32F)
-            col2im_cpu((float*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float*)dstMat.ptr());
+            col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
         if (dstMat.type() == CV_64F)
-            col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr());
+            col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
     }
 }
 }
@@ -55,130 +55,147 @@ using std::exp;
 using std::tanh;
 using std::pow;

     template<typename Func>
     class ElementWiseLayer : public Layer
     {
         Func func;

+        template<typename Dtype>
+        class PBody : public cv::ParallelLoopBody
+        {
+            Dtype *data;
+            Func &func;
+
+        public:
+
+            PBody(Blob &blob, Func &func_) :
+                func(func_), data(blob.ptr<Dtype>())
+            {}
+
+            void operator()(const Range &r) const
+            {
+                for (int i = r.start; i < r.end; i++)
+                    data[i] = func(data[i]);
+            }
+        };
+
     public:

         ElementWiseLayer(LayerParams &_params) : func(_params) {}

         void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
         {
             outputs.resize(inputs.size());
             for (size_t i = 0; i < inputs.size(); i++)
                 outputs[i].shareFrom(*inputs[i]); //no data copy
         }

         void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
         {
             for (size_t i = 0; i < inputs.size(); i++)
             {
                 CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
-                size_t size = outputs[i].total();
+                CV_Assert(inputs[i]->matRefConst().isContinuous());
+
+                Range sizeRange = Range(0, outputs[i].total());
+
                 if (outputs[i].type() == CV_32F)
                 {
-                    float *data = outputs[i].ptrf();
-                    for (size_t j = 0; j < size; j++)
-                        data[j] = func(data[j]);
+                    cv::parallel_for_(sizeRange, PBody<float>(outputs[i], func));
                 }
                 else if (outputs[i].type() == CV_64F)
                 {
-                    double *data = outputs[i].ptr<double>();
-                    for (size_t j = 0; j < size; j++)
-                        data[j] = func(data[j]);
+                    cv::parallel_for_(sizeRange, PBody<double>(outputs[i], func));
                 }
                 else
                 {
                     CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported");
                 }
             }
         }
     };

     struct ReLUFunctor
     {
         float negative_slope;

         ReLUFunctor(LayerParams &params)
         {
             if (params.has("negative_slope"))
                 negative_slope = params.get<float>("negative_slope");
             else
                 negative_slope = 0.f;
         }

         template<typename TFloat>
-        inline TFloat operator()(TFloat x)
+        inline TFloat operator()(TFloat x) const
         {
             return (x >= (TFloat)0) ? x : negative_slope * x;
         }
     };

     struct TanHFunctor
     {
         TanHFunctor(LayerParams&) {}

         template<typename TFloat>
-        inline TFloat operator()(TFloat x)
+        inline TFloat operator()(TFloat x) const
         {
             return tanh(x);
         }
     };

     struct SigmoidFunctor
     {
         SigmoidFunctor(LayerParams&) {}

         template<typename TFloat>
-        inline TFloat operator()(TFloat x)
+        inline TFloat operator()(TFloat x) const
         {
             return (TFloat)1 / ((TFloat)1 + exp(-x));
         }
     };

     struct AbsValFunctor
     {
         AbsValFunctor(LayerParams&) {}

         template<typename TFloat>
-        inline TFloat operator()(TFloat x)
+        inline TFloat operator()(TFloat x) const
         {
             return abs(x);
         }
     };

     struct PowerFunctor
     {
         float power, scale, shift;

         PowerFunctor(LayerParams &params)
         {
             power = params.get<float>("power", 1.0f);
             scale = params.get<float>("scale", 1.0f);
             shift = params.get<float>("shift", 0.0f);
         }

         template<typename TFloat>
-        inline TFloat operator()(TFloat x)
+        inline TFloat operator()(TFloat x) const
         {
             return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
         }
     };

     struct BNLLFunctor
     {
         BNLLFunctor(LayerParams&) {}

         template<typename TFloat>
-        inline TFloat operator()(TFloat x)
+        inline TFloat operator()(TFloat x) const
         {
             return log((TFloat)1 + exp(-abs(x)));
         }
     };
 }
 }

 #endif
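The functors above stay single-element callables; the diff's only change to them is a const qualifier on operator(), documenting that applying a functor is stateless and therefore safe to call concurrently from PBody's worker threads. How a functor becomes a layer is not shown in this commit; presumably each activation is an instantiation of the ElementWiseLayer template along these lines:

    // Hypothetical wiring, not part of this diff:
    typedef ElementWiseLayer<ReLUFunctor>    ReLULayer;
    typedef ElementWiseLayer<SigmoidFunctor> SigmoidLayer;

Because PBody holds the functor by reference and touches each element exactly once, any such stateless functor gets the parallel path for free.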
@@ -41,6 +41,8 @@
 #ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
 #define __OPENCV_DNN_LAYERS_IM2COL_HPP__

+#include <opencv2/core.hpp>
+#include <iostream>

 namespace cv
 {
@@ -48,33 +50,67 @@ namespace dnn
 {

 template <typename Dtype>
-void im2col_cpu(const Dtype* data_im,
-    int channels, int height, int width,
-    int kernel_h, int kernel_w,
-    int pad_h, int pad_w,
-    int stride_h, int stride_w,
-    Dtype* data_col)
+class im2col_CpuPBody : public cv::ParallelLoopBody
 {
-    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-    for (int c = 0; c < channels_col; ++c) {
-        int w_offset = c % kernel_w;
-        int h_offset = (c / kernel_w) % kernel_h;
-        int c_im = c / kernel_h / kernel_w;
-        for (int h = 0; h < height_col; ++h) {
-            for (int w = 0; w < width_col; ++w) {
-                int h_pad = h * stride_h - pad_h + h_offset;
-                int w_pad = w * stride_w - pad_w + w_offset;
-                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                    data_col[(c * height_col + h) * width_col + w] =
-                        data_im[(c_im * height + h_pad) * width + w_pad];
-                else
-                    data_col[(c * height_col + h) * width_col + w] = 0;
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+public:
+
+    im2col_CpuPBody(const Dtype* data_im_,
+        int channels_, int height_, int width_,
+        int kernel_h_, int kernel_w_,
+        int pad_h_, int pad_w_,
+        int stride_h_, int stride_w_,
+        Dtype* data_col_) :
+        data_im(data_im_),
+        channels(channels_), height(height_), width(width_),
+        kernel_h(kernel_h_), kernel_w(kernel_w_),
+        pad_h(pad_h_), pad_w(pad_w_),
+        stride_h(stride_h_), stride_w(stride_w_),
+        data_col(data_col_)
+    {
+        height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+        channels_col = channels * kernel_h * kernel_w;
+    }
+
+    static void run(const Dtype* data_im,
+        int channels, int height, int width,
+        int kernel_h, int kernel_w,
+        int pad_h, int pad_w,
+        int stride_h, int stride_w,
+        Dtype* data_col)
+    {
+        im2col_CpuPBody<Dtype> pb(data_im, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_col);
+        cv::parallel_for_(Range(0, pb.channels_col), pb);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        for (int c = r.start; c < r.end; ++c) {
+            int w_offset = c % kernel_w;
+            int h_offset = (c / kernel_w) % kernel_h;
+            int c_im = c / kernel_h / kernel_w;
+            for (int h = 0; h < height_col; ++h) {
+                for (int w = 0; w < width_col; ++w) {
+                    int h_pad = h * stride_h - pad_h + h_offset;
+                    int w_pad = w * stride_w - pad_w + w_offset;
+                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                        data_col[(c * height_col + h) * width_col + w] =
+                            data_im[(c_im * height + h_pad) * width + w_pad];
+                    else
+                        data_col[(c * height_col + h) * width_col + w] = 0;
+                }
             }
         }
     }
-}
+};

 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col,
...
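The static run() helper preserves the old free-function call shape, which is why the convolution hunk above only swaps the name im2col_cpu for im2col_CpuPBody<...>::run. A hedged usage sketch (the sizes are illustrative; the buffer lengths follow the height_col/width_col formulas in the constructor):

    // Illustrative sizes: 3-channel 32x32 input, 3x3 kernel, pad 1, stride 1.
    const int C = 3, H = 32, W = 32, K = 3, pad = 1, stride = 1;
    const int Hcol = (H + 2*pad - K)/stride + 1;   // output rows per unrolled channel
    const int Wcol = (W + 2*pad - K)/stride + 1;   // output cols per unrolled channel
    std::vector<float> im(C*H*W), col(C*K*K * Hcol*Wcol);

    // run() builds the body and parallelizes over the C*K*K unrolled channels.
    cv::dnn::im2col_CpuPBody<float>::run(im.data(), C, H, W, K, K,
                                         pad, pad, stride, stride, col.data());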
@@ -65,6 +65,8 @@ static void testLayer(String basename, bool useCaffeModel = false, bool useCommonInputBlob
     String inpfile = (useCommonInputBlob) ? _tf("blob.npy") : _tf(basename + ".input.npy");
     String outfile = _tf(basename + ".npy");

+    cv::setNumThreads(cv::getNumberOfCPUs());
+
     Net net;
     {
         Ptr<Importer> importer = createCaffeImporter(prototxt, (useCaffeModel) ? caffemodel : String());
...
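The one-line test change makes the parallel paths above exercise every detected core instead of whatever thread count OpenCV defaults to. The same knob works in user code; for example (a sketch, not from this commit):

    cv::setNumThreads(cv::getNumberOfCPUs()); // one worker per detected core
    // ... run layers / nets ...
    cv::setNumThreads(0);                     // disable threading, e.g. while debugging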