Commit 4cb108ef authored by Vitaliy Lyudvichenko

Adding CPU parallelization for element-wise layers and im2col operation

parent 6d3cb808
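The change applies one pattern throughout: a loop body is wrapped in a subclass of cv::ParallelLoopBody and dispatched with cv::parallel_for_, which splits the iteration Range across OpenCV's worker threads. A minimal sketch of that pattern follows (the DoubleBody class and its buffer are illustrative only, not part of this commit):

    #include <opencv2/core.hpp>

    // Illustrative only: doubles every element of a raw float buffer in
    // parallel, the same shape as PBody and im2col_CpuPBody in the diff.
    class DoubleBody : public cv::ParallelLoopBody
    {
        float *data;
    public:
        DoubleBody(float *data_) : data(data_) {}

        // Called concurrently on disjoint sub-ranges of the full Range.
        void operator()(const cv::Range &r) const
        {
            for (int i = r.start; i < r.end; i++)
                data[i] *= 2.0f;
        }
    };

    // usage: cv::parallel_for_(cv::Range(0, n), DoubleBody(ptr));

When OpenCV is built without a threading backend, parallel_for_ degrades to a plain serial loop, so the same code path remains correct on single-threaded builds.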
@@ -179,9 +179,9 @@ namespace dnn
 #endif // HAVE_OPENCL
         if (inpBlob.type() == CV_32F)
-            im2col_cpu((float *)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
+            im2col_CpuPBody<float>::run((float*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
         if (inpBlob.type() == CV_64F)
-            im2col_cpu((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
+            im2col_CpuPBody<double>::run((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
     }

     void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
@@ -253,9 +253,9 @@ namespace dnn
         if (is1x1()) return;

         if (dstMat.type() == CV_32F)
-            col2im_cpu((float*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float*)dstMat.ptr());
+            col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
         if (dstMat.type() == CV_64F)
-            col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr());
+            col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
     }

 }
 }
@@ -55,12 +55,31 @@ using std::exp;
 using std::tanh;
 using std::pow;

 template<typename Func>
 class ElementWiseLayer : public Layer
 {
     Func func;

+    template<typename Dtype>
+    class PBody : public cv::ParallelLoopBody
+    {
+        Dtype *data;
+        Func &func;
+
     public:
+
+        PBody(Blob &blob, Func &func_) :
+            func(func_), data(blob.ptr<Dtype>())
+        {}
+
+        void operator()(const Range &r) const
+        {
+            for (int i = r.start; i < r.end; i++)
+                data[i] = func(data[i]);
+        }
+    };
+
+public:

     ElementWiseLayer(LayerParams &_params) : func(_params) {}

     void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
@@ -75,20 +94,17 @@ using std::pow;
         for (size_t i = 0; i < inputs.size(); i++)
         {
             CV_Assert(inputs[i]->ptr() == outputs[i].ptr() && inputs[i]->type() == outputs[i].type());
+            CV_Assert(inputs[i]->matRefConst().isContinuous());

-            size_t size = outputs[i].total();
+            Range sizeRange = Range(0, outputs[i].total());

             if (outputs[i].type() == CV_32F)
             {
-                float *data = outputs[i].ptrf();
-                for (size_t j = 0; j < size; j++)
-                    data[j] = func(data[j]);
+                cv::parallel_for_(sizeRange, PBody<float>(outputs[i], func));
             }
             else if (outputs[i].type() == CV_64F)
             {
-                double *data = outputs[i].ptr<double>();
-                for (size_t j = 0; j < size; j++)
-                    data[j] = func(data[j]);
+                cv::parallel_for_(sizeRange, PBody<double>(outputs[i], func));
             }
             else
             {
@@ -96,11 +112,11 @@ using std::pow;
             }
         }
     }
 };
 struct ReLUFunctor
 {
     float negative_slope;

     ReLUFunctor(LayerParams &params)
@@ -112,47 +128,47 @@ using std::pow;
     }

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return (x >= (TFloat)0) ? x : negative_slope * x;
     }
 };

 struct TanHFunctor
 {
     TanHFunctor(LayerParams&) {}

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return tanh(x);
     }
 };

 struct SigmoidFunctor
 {
     SigmoidFunctor(LayerParams&) {}

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return (TFloat)1 / ((TFloat)1 + exp(-x));
     }
 };

 struct AbsValFunctor
 {
     AbsValFunctor(LayerParams&) {}

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return abs(x);
     }
 };

 struct PowerFunctor
 {
     float power, scale, shift;

     PowerFunctor(LayerParams &params)
@@ -163,22 +179,23 @@ using std::pow;
     }

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
     }
 };

 struct BNLLFunctor
 {
     BNLLFunctor(LayerParams&) {}

     template<typename TFloat>
-    inline TFloat operator()(TFloat x)
+    inline TFloat operator()(TFloat x) const
     {
         return log((TFloat)1 + exp(-abs(x)));
     }
 };
 }
 }

 #endif
@@ -41,6 +41,8 @@
 #ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
 #define __OPENCV_DNN_LAYERS_IM2COL_HPP__

+#include <opencv2/core.hpp>
+#include <iostream>

 namespace cv
 {
@@ -48,17 +50,50 @@ namespace dnn
 {

 template <typename Dtype>
-void im2col_cpu(const Dtype* data_im,
+class im2col_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_im;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    Dtype* data_col;
+    int height_col, width_col, channels_col;
+
+public:
+
+    im2col_CpuPBody(const Dtype* data_im_,
+                    int channels_, int height_, int width_,
+                    int kernel_h_, int kernel_w_,
+                    int pad_h_, int pad_w_,
+                    int stride_h_, int stride_w_,
+                    Dtype* data_col_) :
+        data_im(data_im_),
+        channels(channels_), height(height_), width(width_),
+        kernel_h(kernel_h_), kernel_w(kernel_w_),
+        pad_h(pad_h_), pad_w(pad_w_),
+        stride_h(stride_h_), stride_w(stride_w_),
+        data_col(data_col_)
+    {
+        height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+        channels_col = channels * kernel_h * kernel_w;
+    }
+
+    static void run(const Dtype* data_im,
                 int channels, int height, int width,
                 int kernel_h, int kernel_w,
                 int pad_h, int pad_w,
                 int stride_h, int stride_w,
                 Dtype* data_col)
     {
-    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-    for (int c = 0; c < channels_col; ++c) {
+        im2col_CpuPBody<Dtype> pb(data_im, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_col);
+        cv::parallel_for_(Range(0, pb.channels_col), pb);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        for (int c = r.start; c < r.end; ++c) {
         int w_offset = c % kernel_w;
         int h_offset = (c / kernel_w) % kernel_h;
         int c_im = c / kernel_h / kernel_w;
@@ -74,7 +109,8 @@ void im2col_cpu(const Dtype* data_im,
             }
         }
     }
 }
+};

 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col,
...
@@ -65,6 +65,8 @@ static void testLayer(String basename, bool useCaffeModel = false, bool useCommo
     String inpfile = (useCommonInputBlob) ? _tf("blob.npy") : _tf(basename + ".input.npy");
     String outfile = _tf(basename + ".npy");

+    cv::setNumThreads(cv::getNumberOfCPUs());
+
     Net net;
     {
         Ptr<Importer> importer = createCaffeImporter(prototxt, (useCaffeModel) ? caffemodel : String());
...