Commit b70a9dc1 authored by Vitaliy Lyudvichenko

Adding of templated GPU/CPU implementation of Convolution layer

parent b26896c0
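The change below templates ConvolutionLayer's forward pass on the matrix type, so the same code can run on cv::Mat (CPU) or cv::UMat (OpenCL). As a rough, standalone sketch of that dispatch pattern (forwardImpl and runForward are illustrative names, not code from this commit):

// Minimal sketch of the Mat/UMat dispatch idea; the real layer uses
// ConvolutionLayer::forward_<XMat>() with im2col + GEMM inside the loop.
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>

template<typename XMat>
static void forwardImpl(const XMat &input, XMat &output)
{
    // Stand-in for the im2col + GEMM pipeline: any InputArray-based
    // function works identically for Mat and UMat containers.
    cv::add(input, input, output);
}

static void runForward(const cv::Mat &input, cv::Mat &output, bool tryUseOpenCL)
{
    bool useOpenCL = tryUseOpenCL && cv::ocl::useOpenCL();
    if (!useOpenCL)
    {
        forwardImpl<cv::Mat>(input, output);
    }
    else
    {
        cv::UMat uinput = input.getUMat(cv::ACCESS_READ), uoutput;
        forwardImpl<cv::UMat>(uinput, uoutput);
        uoutput.copyTo(output);  // bring the result back to host memory
    }
}

The real layer makes this decision once in allocate() (storing it in useOpenCL) and keeps its scratch buffers in Blob objects that can hold both Mat and UMat storage, rather than converting per call.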
@@ -297,6 +297,7 @@ namespace dnn
         mutable uchar state;
 #endif
+    public:
         enum DataState
         {
             UNINITIALIZED,
...
@@ -346,12 +346,12 @@ inline size_t Blob::offset(int n, int cn, int row, int col) const

 inline float *Blob::ptrf(int n, int cn, int row, int col)
 {
-    return matRef().ptr<float>() + offset(n, cn, row, col);
+    return matRef(false).ptr<float>() + offset(n, cn, row, col);
 }

 inline uchar *Blob::ptr(int n, int cn, int row, int col)
 {
-    Mat &mat = matRef();
+    Mat &mat = matRef(false);
     return mat.ptr() + mat.elemSize() * offset(n, cn, row, col);
 }
...
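For reference, Blob::ptrf()/ptr() above turn (n, cn, row, col) into a raw pointer via Blob::offset(). Assuming the blob is a dense, row-major NCHW array (num, channels, rows, cols), the offset is the usual linear index; a hedged sketch with illustrative names and shapes:

// Hedged sketch of what Blob::offset() presumably computes for a dense
// NCHW blob; the helper name and shape values are illustrative only.
#include <cstddef>

inline size_t nchwOffset(int n, int cn, int row, int col,
                         int channels, int rows, int cols)
{
    return ((static_cast<size_t>(n) * channels + cn) * rows + row) * cols + col;
}

// ptrf()-style access is then basePtr + nchwOffset(...). For example,
// element (n=1, cn=0, row=2, col=3) of a 2x3x4x5 blob lives at linear
// index ((1*3 + 0)*4 + 2)*5 + 3 = 73.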
@@ -51,8 +51,23 @@ namespace cv
 {
 namespace dnn
 {
-    ConvolutionLayer::ConvolutionLayer(LayerParams &params) : Layer(params)
-    {
+    typedef BlobShape Shape;
+
+    template<typename Mat>
+    void reshape(Mat &m, const BlobShape &shape)
+    {
+        m = m.reshape(1, shape.dims(), shape.ptr());
+    }
+
+    template<typename Mat>
+    Mat reshaped(const Mat &m, const BlobShape &shape)
+    {
+        return m.reshape(1, shape.dims(), shape.ptr());
+    }
+
+    ConvolutionLayer::ConvolutionLayer(LayerParams &params) : Layer(params)
+    {
         getKernelParams(params, kerH, kerW, padH, padW, strideH, strideW);

         numOutput = params.get<int>("num_output");
@@ -72,21 +87,18 @@ namespace dnn
             CV_Assert(biasBlob.total() == (size_t)numOutput);
         }

-        //TBD
-        useOpenCL = params.has("use_opencl");
-
 #if HAVE_CBLAS
-        {
         if (getBlasThreads() != cv::getThreadNum())
         {
             setBlasThreads(cv::getThreadNum());
         }
-        }
 #endif
+
+        tryUseOpenCL = true;
     }

     void ConvolutionLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
     {
         CV_Assert(inputs.size() > 0);

         const Blob &inpBlob = *inputs[0];
@@ -107,85 +119,119 @@ namespace dnn
             CV_Assert(inputs[i]->dims() == 4 && inputs[i]->channels() == inpBlob.channels());
             CV_Assert(inputs[i]->rows() == inpBlob.rows() && inputs[i]->cols() == inpBlob.cols());

-            outputs[i].create(BlobShape(inputs[i]->num(), topCn, topH, topW));
+            outputs[i].create(Shape(inputs[i]->num(), topCn, topH, topW));
         }

+#ifdef HAVE_OPENCL
+        useOpenCL = ocl::useOpenCL() && tryUseOpenCL;
+#else
+        useOpenCL = false;
+#endif
+
+        int allocFlags = useOpenCL ? Blob::ALLOC_BOTH : Blob::ALLOC_MAT;
+
         if (!is1x1())
-            colMat.create(ksize, outH * outW, inpBlob.type());
+        {
+            colBlob.create(Shape(ksize, outH * outW), inpBlob.type(), allocFlags);
+            colMat = colBlob.matRef();
+        }

         if (bias)
-            biasOnesMat = Mat::ones(1, topH * topW, inpBlob.type());
+        {
+            biasOnesBlob.create(Shape(1, topH * topW), inpBlob.type(), allocFlags);
+            biasOnesBlob.matRef().setTo(1);
+            biasOnesMat = biasOnesBlob.matRefConst();
+        }
     }

     inline bool ConvolutionLayer::is1x1() const
     {
         return (kerH == 1 && kerW == 1) && (strideW == 1 && strideH == 1); //hotfix with stride
     }
-    void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+    template<typename Mat>
+    void ConvolutionLayer::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
     {
-        Blob &wgtBlob = blobs[0];
+        Mat weightsMat = reshaped(blobs[0].getRefConst<Mat>(), Shape(outCn, ksize));
+        Mat biasesMat = reshaped(blobs[1].getRefConst<Mat>(), Shape(outCn, 1));

         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
             Blob &inpBlob = *inputs[ii];
             Blob &outBlob = outputs[ii];

+            Mat inpMat = inpBlob.getRefConst<Mat>();
+            Mat outMat = reshaped(outBlob.getRef<Mat>(), Shape(inpBlob.num()*group*outGroupCn, outH*outW));
+
+            int outCurrCn = 0;
             for (int n = 0; n < inpBlob.num(); n++)
             {
+                int kerCurrCn = 0;
                 for (int g = 0; g < group; g++)
                 {
-                    im2col(inpBlob, n, g);
+                    im2col(inpBlob, n, g, colBlob);
+                    const Mat &colMat = colBlob.getRefConst<Mat>();

-                    Mat kerMat(outGroupCn, ksize, wgtBlob.type(), wgtBlob.ptr(g*outGroupCn));
-                    Mat dstMat(outGroupCn, outH*outW, outBlob.type(), outBlob.ptr(n, g*outGroupCn));
+                    Range kerRange(kerCurrCn, kerCurrCn + outGroupCn);
+                    Mat kerMat = weightsMat.rowRange(kerRange);

-                    gemmCPU(kerMat, colMat, 1, dstMat, 0);
+                    Range outRange(outCurrCn, outCurrCn + outGroupCn);
+                    Mat dstMat = outMat.rowRange(outRange);
+
+                    dnn::gemm(kerMat, colMat, 1, dstMat, 0);

                     if (bias)
                     {
-                        float *biasPtr = blobs[1].ptrf() + g*outGroupCn;
-                        Mat biasMat(outGroupCn, 1, CV_32F, biasPtr);
-                        gemmCPU(biasMat, biasOnesMat, 1, dstMat, 1); //TODO: gemv
+                        dnn::gemm(biasesMat.rowRange(kerRange), biasOnesMat, 1, dstMat, 1);
                     }
+
+                    kerCurrCn += outGroupCn;
+                    outCurrCn += outGroupCn;
                 }
             }
         }
     }
-    void ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup)
+    void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
     {
-        uchar *srcPtr = inpBlob.ptr(imNum, cnGroup*inpGroupCn);
-
-        if (is1x1())
-        {
-            colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), inpBlob.type(), srcPtr);
-            return;
-        }
+        if (!useOpenCL)
+            forward_<Mat>(inputs, outputs);
+        else
+            forward_<UMat>(inputs, outputs);
+    }

+    void ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup, Blob &colBlob)
+    {
 #ifdef HAVE_OPENCL
-        if (useOpenCL && ocl::useOpenCL() && inpBlob.type() == CV_32F && !is1x1())
+        if (useOpenCL)
         {
             std::vector<Range> ranges(4, Range::all());
             ranges[0] = Range(imNum, imNum+1);
             ranges[1] = Range(cnGroup*inpGroupCn, (cnGroup + 1)*inpGroupCn);

-            UMat src = inpBlob.matRef()(&ranges[0]).getUMat(ACCESS_READ);
-            UMat dst(colMat.size(), colMat.type());
+            UMat src = inpBlob.umatRef()(&ranges[0]);
+            UMat &dst = colBlob.umatRef();
             im2col_ocl(src, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dst);
-            dst.copyTo(colMat);
             return;
         }
 #endif // HAVE_OPENCL

+        Mat &colMat = colBlob.matRef();
+        uchar *srcPtr = inpBlob.ptr(imNum, cnGroup*inpGroupCn);
+
+        if (is1x1())
+        {
+            colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), inpBlob.type(), srcPtr);
+            return;
+        }
+
         if (inpBlob.type() == CV_32F)
             im2col_CpuPBody<float>::run((float*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, colMat.ptr<float>());
         if (inpBlob.type() == CV_64F)
             im2col_CpuPBody<double>::run((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, colMat.ptr<double>());
     }

     void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
     {
         inpH = inpBlob.rows();
         inpW = inpBlob.cols();
         inpCn = inpBlob.channels();
@@ -195,13 +241,13 @@ namespace dnn
         outCn = numOutput;

         topH = outH; topW = outW; topCn = outCn;
     }

     DeConvolutionLayer::DeConvolutionLayer(LayerParams &params)
         : ConvolutionLayer(params) {}

     void DeConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
     {
         outH = inpBlob.rows();
         outW = inpBlob.cols();
         outCn = inpBlob.channels();
@@ -211,10 +257,10 @@ namespace dnn
         inpCn = numOutput;

         topH = inpH; topW = inpW; topCn = inpCn;
     }

     void DeConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
     {
         Blob &wghtBlob = blobs[0];

         for (size_t ii = 0; ii < outputs.size(); ii++)
@@ -246,16 +292,17 @@ namespace dnn
                 }
             }
         }
     }

     void DeConvolutionLayer::col2im(Mat &dstMat)
     {
         if (is1x1()) return;

         if (dstMat.type() == CV_32F)
             col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
         if (dstMat.type() == CV_64F)
             col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
     }
 }
 }
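A detail worth noting in forward_() above: the bias is applied with a second GEMM against biasOnesMat, a 1 x (topH*topW) row of ones, which broadcasts each output channel's bias over its whole plane while accumulating into the existing result (beta = 1). A small standalone illustration with made-up sizes:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    const int outCn = 2, planeSize = 4;                       // illustrative sizes only
    cv::Mat dst = cv::Mat::zeros(outCn, planeSize, CV_32F);   // stands in for kernels * columns
    cv::Mat biases = (cv::Mat_<float>(outCn, 1) << 0.5f, -1.0f);
    cv::Mat onesRow = cv::Mat::ones(1, planeSize, CV_32F);

    // result = 1.0 * biases * onesRow + 1.0 * dst: every element of row c
    // gets biases(c) added, mirroring what forward_() does per group.
    cv::Mat dstPlusBias;
    cv::gemm(biases, onesRow, 1.0, dst, 1.0, dstPlusBias);

    std::cout << dstPlusBias << std::endl;  // row 0 is all 0.5, row 1 is all -1
    return 0;
}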
@@ -63,18 +63,23 @@ namespace dnn
         int inpGroupCn, outGroupCn;
         int ksize;

-        bool useOpenCL;
+        bool tryUseOpenCL, useOpenCL;
+
+        Blob colBlob, biasOnesBlob;
         Mat colMat, biasOnesMat;

         inline bool is1x1() const;
         virtual void computeInpOutShape(const Blob &inpBlob);
-        void im2col(Blob &inpBlob, int imNum, int cnGroup);
+        void im2col(Blob &inpBlob, int imNum, int cnGroup, Blob &colBlob);

     public:
         ConvolutionLayer() {}
         ConvolutionLayer(LayerParams &params);

         void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
         void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+
+        template<typename XMat>
+        void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
     };

     class DeConvolutionLayer : public ConvolutionLayer
...
@@ -11,9 +11,15 @@ namespace cv
 namespace dnn
 {

-void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags /*= 0*/)
+void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags)
 {
+    if (C.isMat())
+        gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
+    else
+    {
     cv::gemm(A, B, alpha, C, beta, C, flags);
+        std::cout << "OCL gemm\n";
+    }
 }

 inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
@@ -35,10 +41,9 @@ void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int
     SwapRowCols(B, Brows, Bcols, transB);
     SwapRowCols(C, Crows, Ccols, transC);

-    CV_DbgAssert(!(flags & GEMM_3_T));
+    CV_Assert(!(flags & GEMM_3_T));
     CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
     CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
-    CV_Assert(A.type() == CV_32F || A.type() == CV_64F);
     CV_Assert(A.type() == B.type() && B.type() == C.type());
     CV_Assert(A.data != C.data && B.data != C.data);
@@ -59,6 +64,10 @@ void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int
                     B.ptr<double>(), B.cols,
                     beta, C.ptr<double>(), C.cols);
     }
+    else
+    {
+        CV_Error(Error::BadDepth, "Only floating point types are supported");
+    }
 #else
     cv::gemm(A, B, alpha, C, beta, C, flags);
 #endif
@@ -70,7 +79,6 @@ int getBlasThreads()
     return openblas_get_num_threads();
 #else
     return 1;
 #endif
 }
@@ -81,7 +89,6 @@ void setBlasThreads(int numThreads)
     goto_set_num_threads(numThreads);
 #else
     (void)numThreads; //suppress compilers' warning
-    numThreads = 0;
 #endif
 }
...
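For readers unfamiliar with the GEMM flag conventions that gemmCPU now asserts on (GEMM_3_T is rejected, and non-floating-point depths raise BadDepth), here is the flag semantics of cv::gemm in isolation, with made-up matrices:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Mat A = (cv::Mat_<float>(2, 3) << 1, 2, 3, 4, 5, 6);
    cv::Mat B = cv::Mat::eye(2, 2, CV_32F);
    cv::Mat C;

    // C = 1.0 * A^T * B + 0.0 * (no delta term); GEMM_1_T transposes A,
    // GEMM_2_T would transpose B, GEMM_3_T would transpose the delta term.
    cv::gemm(A, B, 1.0, cv::noArray(), 0.0, C, cv::GEMM_1_T);

    std::cout << C << std::endl;  // 3x2 result, equal to A transposed here
    return 0;
}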
@@ -42,6 +42,7 @@
 #if defined(ENABLE_CAFFE_MODEL_TESTS)
 #include "test_precomp.hpp"
 #include "npy_blob.hpp"
+#include <opencv2/core/ocl.hpp>

 namespace cvtest
 {
...