Commit b70a9dc1 authored by Vitaliy Lyudvichenko

Adding of templated GPU/CPU implementation of Convolution layer

parent b26896c0
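The change below templates ConvolutionLayer's forward pass on the matrix type, so the same code can run on cv::Mat (CPU) or cv::UMat (OpenCL). As a rough, standalone sketch of that dispatch pattern (forwardImpl and runForward are illustrative names, not code from this commit):

// Minimal sketch of the Mat/UMat dispatch idea; the real layer uses
// ConvolutionLayer::forward_<XMat>() with im2col + GEMM inside the loop.
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>

template<typename XMat>
static void forwardImpl(const XMat &input, XMat &output)
{
    // Stand-in for the im2col + GEMM pipeline: any InputArray-based
    // function works identically for Mat and UMat containers.
    cv::add(input, input, output);
}

static void runForward(const cv::Mat &input, cv::Mat &output, bool tryUseOpenCL)
{
    bool useOpenCL = tryUseOpenCL && cv::ocl::useOpenCL();
    if (!useOpenCL)
    {
        forwardImpl<cv::Mat>(input, output);
    }
    else
    {
        cv::UMat uinput = input.getUMat(cv::ACCESS_READ), uoutput;
        forwardImpl<cv::UMat>(uinput, uoutput);
        uoutput.copyTo(output);  // bring the result back to host memory
    }
}

The real layer makes this decision once in allocate() (storing it in useOpenCL) and keeps its scratch buffers in Blob objects that can hold both Mat and UMat storage, rather than converting per call.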
@@ -297,6 +297,7 @@ namespace dnn
         mutable uchar state;
 #endif
+    public:
         enum DataState
         {
             UNINITIALIZED,
...
@@ -346,12 +346,12 @@ inline size_t Blob::offset(int n, int cn, int row, int col) const

 inline float *Blob::ptrf(int n, int cn, int row, int col)
 {
-    return matRef().ptr<float>() + offset(n, cn, row, col);
+    return matRef(false).ptr<float>() + offset(n, cn, row, col);
 }

 inline uchar *Blob::ptr(int n, int cn, int row, int col)
 {
-    Mat &mat = matRef();
+    Mat &mat = matRef(false);
     return mat.ptr() + mat.elemSize() * offset(n, cn, row, col);
 }
...
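For reference, Blob::ptrf()/ptr() above turn (n, cn, row, col) into a raw pointer via Blob::offset(). Assuming the blob is a dense, row-major NCHW array (num, channels, rows, cols), the offset is the usual linear index; a hedged sketch with illustrative names and shapes:

// Hedged sketch of what Blob::offset() presumably computes for a dense
// NCHW blob; the helper name and shape values are illustrative only.
#include <cstddef>

inline size_t nchwOffset(int n, int cn, int row, int col,
                         int channels, int rows, int cols)
{
    return ((static_cast<size_t>(n) * channels + cn) * rows + row) * cols + col;
}

// ptrf()-style access is then basePtr + nchwOffset(...). For example,
// element (n=1, cn=0, row=2, col=3) of a 2x3x4x5 blob lives at linear
// index ((1*3 + 0)*4 + 2)*5 + 3 = 73.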
@@ -51,8 +51,23 @@ namespace cv
 {
 namespace dnn
 {
-    ConvolutionLayer::ConvolutionLayer(LayerParams &params) : Layer(params)
-    {
+    typedef BlobShape Shape;
+
+    template<typename Mat>
+    void reshape(Mat &m, const BlobShape &shape)
+    {
+        m = m.reshape(1, shape.dims(), shape.ptr());
+    }
+
+    template<typename Mat>
+    Mat reshaped(const Mat &m, const BlobShape &shape)
+    {
+        return m.reshape(1, shape.dims(), shape.ptr());
+    }
+
+    ConvolutionLayer::ConvolutionLayer(LayerParams &params) : Layer(params)
+    {
         getKernelParams(params, kerH, kerW, padH, padW, strideH, strideW);

         numOutput = params.get<int>("num_output");
@@ -72,21 +87,18 @@ namespace dnn
             CV_Assert(biasBlob.total() == (size_t)numOutput);
         }

-        //TBD
-        useOpenCL = params.has("use_opencl");
-
 #if HAVE_CBLAS
-        {
         if (getBlasThreads() != cv::getThreadNum())
         {
             setBlasThreads(cv::getThreadNum());
         }
-        }
 #endif
+
+        tryUseOpenCL = true;
     }

     void ConvolutionLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
     {
         CV_Assert(inputs.size() > 0);

         const Blob &inpBlob = *inputs[0];
@@ -107,85 +119,119 @@ namespace dnn
             CV_Assert(inputs[i]->dims() == 4 && inputs[i]->channels() == inpBlob.channels());
             CV_Assert(inputs[i]->rows() == inpBlob.rows() && inputs[i]->cols() == inpBlob.cols());

-            outputs[i].create(BlobShape(inputs[i]->num(), topCn, topH, topW));
+            outputs[i].create(Shape(inputs[i]->num(), topCn, topH, topW));
         }

+#ifdef HAVE_OPENCL
+        useOpenCL = ocl::useOpenCL() && tryUseOpenCL;
+#else
+        useOpenCL = false;
+#endif
+
+        int allocFlags = useOpenCL ? Blob::ALLOC_BOTH : Blob::ALLOC_MAT;
+
         if (!is1x1())
-            colMat.create(ksize, outH * outW, inpBlob.type());
+        {
+            colBlob.create(Shape(ksize, outH * outW), inpBlob.type(), allocFlags);
+            colMat = colBlob.matRef();
+        }

         if (bias)
-            biasOnesMat = Mat::ones(1, topH * topW, inpBlob.type());
+        {
+            biasOnesBlob.create(Shape(1, topH * topW), inpBlob.type(), allocFlags);
+            biasOnesBlob.matRef().setTo(1);
+            biasOnesMat = biasOnesBlob.matRefConst();
+        }
     }

     inline bool ConvolutionLayer::is1x1() const
     {
         return (kerH == 1 && kerW == 1) && (strideW == 1 && strideH == 1); //hotfix with stride
     }
-    void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+    template<typename Mat>
+    void ConvolutionLayer::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
     {
-        Blob &wgtBlob = blobs[0];
+        Mat weightsMat = reshaped(blobs[0].getRefConst<Mat>(), Shape(outCn, ksize));
+        Mat biasesMat = reshaped(blobs[1].getRefConst<Mat>(), Shape(outCn, 1));

         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
             Blob &inpBlob = *inputs[ii];
             Blob &outBlob = outputs[ii];

+            Mat inpMat = inpBlob.getRefConst<Mat>();
+            Mat outMat = reshaped(outBlob.getRef<Mat>(), Shape(inpBlob.num()*group*outGroupCn, outH*outW));
+
+            int outCurrCn = 0;
             for (int n = 0; n < inpBlob.num(); n++)
             {
+                int kerCurrCn = 0;
                 for (int g = 0; g < group; g++)
                 {
-                    im2col(inpBlob, n, g);
+                    im2col(inpBlob, n, g, colBlob);
+                    const Mat &colMat = colBlob.getRefConst<Mat>();

-                    Mat kerMat(outGroupCn, ksize, wgtBlob.type(), wgtBlob.ptr(g*outGroupCn));
-                    Mat dstMat(outGroupCn, outH*outW, outBlob.type(), outBlob.ptr(n, g*outGroupCn));
+                    Range kerRange(kerCurrCn, kerCurrCn + outGroupCn);
+                    Mat kerMat = weightsMat.rowRange(kerRange);

-                    gemmCPU(kerMat, colMat, 1, dstMat, 0);
+                    Range outRange(outCurrCn, outCurrCn + outGroupCn);
+                    Mat dstMat = outMat.rowRange(outRange);
+
+                    dnn::gemm(kerMat, colMat, 1, dstMat, 0);

                     if (bias)
                     {
-                        float *biasPtr = blobs[1].ptrf() + g*outGroupCn;
-                        Mat biasMat(outGroupCn, 1, CV_32F, biasPtr);
-                        gemmCPU(biasMat, biasOnesMat, 1, dstMat, 1); //TODO: gemv
+                        dnn::gemm(biasesMat.rowRange(kerRange), biasOnesMat, 1, dstMat, 1);
                     }
+
+                    kerCurrCn += outGroupCn;
+                    outCurrCn += outGroupCn;
                 }
             }
         }
     }
-    void ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup)
+    void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
     {
-        uchar *srcPtr = inpBlob.ptr(imNum, cnGroup*inpGroupCn);
-
-        if (is1x1())
-        {
-            colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), inpBlob.type(), srcPtr);
-            return;
-        }
+        if (!useOpenCL)
+            forward_<Mat>(inputs, outputs);
+        else
+            forward_<UMat>(inputs, outputs);
+    }

+    void ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup, Blob &colBlob)
+    {
 #ifdef HAVE_OPENCL
-        if (useOpenCL && ocl::useOpenCL() && inpBlob.type() == CV_32F && !is1x1())
+        if (useOpenCL)
         {
             std::vector<Range> ranges(4, Range::all());
             ranges[0] = Range(imNum, imNum+1);
             ranges[1] = Range(cnGroup*inpGroupCn, (cnGroup + 1)*inpGroupCn);

-            UMat src = inpBlob.matRef()(&ranges[0]).getUMat(ACCESS_READ);
-            UMat dst(colMat.size(), colMat.type());
+            UMat src = inpBlob.umatRef()(&ranges[0]);
+            UMat &dst = colBlob.umatRef();
             im2col_ocl(src, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dst);
-            dst.copyTo(colMat);
             return;
         }
 #endif // HAVE_OPENCL

+        Mat &colMat = colBlob.matRef();
+        uchar *srcPtr = inpBlob.ptr(imNum, cnGroup*inpGroupCn);
+
+        if (is1x1())
+        {
+            colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), inpBlob.type(), srcPtr);
+            return;
+        }
+
         if (inpBlob.type() == CV_32F)
             im2col_CpuPBody<float>::run((float*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, colMat.ptr<float>());
         if (inpBlob.type() == CV_64F)
             im2col_CpuPBody<double>::run((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, colMat.ptr<double>());
     }

     void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
     {
         inpH = inpBlob.rows();
         inpW = inpBlob.cols();
         inpCn = inpBlob.channels();
@@ -195,13 +241,13 @@ namespace dnn
         outCn = numOutput;

         topH = outH; topW = outW; topCn = outCn;
     }

     DeConvolutionLayer::DeConvolutionLayer(LayerParams &params)
         : ConvolutionLayer(params) {}

     void DeConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
     {
         outH = inpBlob.rows();
         outW = inpBlob.cols();
         outCn = inpBlob.channels();
@@ -211,10 +257,10 @@ namespace dnn
         inpCn = numOutput;

         topH = inpH; topW = inpW; topCn = inpCn;
     }

     void DeConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
     {
         Blob &wghtBlob = blobs[0];

         for (size_t ii = 0; ii < outputs.size(); ii++)
@@ -246,16 +292,17 @@ namespace dnn
                 }
             }
         }
     }

     void DeConvolutionLayer::col2im(Mat &dstMat)
     {
         if (is1x1()) return;

         if (dstMat.type() == CV_32F)
             col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
         if (dstMat.type() == CV_64F)
             col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
     }
 }
 }
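A detail worth noting in forward_() above: the bias is applied with a second GEMM against biasOnesMat, a 1 x (topH*topW) row of ones, which broadcasts each output channel's bias over its whole plane while accumulating into the existing result (beta = 1). A small standalone illustration with made-up sizes:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    const int outCn = 2, planeSize = 4;                       // illustrative sizes only
    cv::Mat dst = cv::Mat::zeros(outCn, planeSize, CV_32F);   // stands in for kernels * columns
    cv::Mat biases = (cv::Mat_<float>(outCn, 1) << 0.5f, -1.0f);
    cv::Mat onesRow = cv::Mat::ones(1, planeSize, CV_32F);

    // result = 1.0 * biases * onesRow + 1.0 * dst: every element of row c
    // gets biases(c) added, mirroring what forward_() does per group.
    cv::Mat dstPlusBias;
    cv::gemm(biases, onesRow, 1.0, dst, 1.0, dstPlusBias);

    std::cout << dstPlusBias << std::endl;  // row 0 is all 0.5, row 1 is all -1
    return 0;
}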
@@ -63,18 +63,23 @@ namespace dnn
         int inpGroupCn, outGroupCn;
         int ksize;

-        bool useOpenCL;
+        bool tryUseOpenCL, useOpenCL;
+
+        Blob colBlob, biasOnesBlob;
         Mat colMat, biasOnesMat;

         inline bool is1x1() const;
         virtual void computeInpOutShape(const Blob &inpBlob);
-        void im2col(Blob &inpBlob, int imNum, int cnGroup);
+        void im2col(Blob &inpBlob, int imNum, int cnGroup, Blob &colBlob);

     public:
         ConvolutionLayer() {}
         ConvolutionLayer(LayerParams &params);

         void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
         void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+
+        template<typename XMat>
+        void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
     };

     class DeConvolutionLayer : public ConvolutionLayer
...
@@ -11,9 +11,15 @@ namespace cv
 namespace dnn
 {

-void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags /*= 0*/)
+void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double beta, int flags)
 {
+    if (C.isMat())
+        gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
+    else
+    {
     cv::gemm(A, B, alpha, C, beta, C, flags);
+        std::cout << "OCL gemm\n";
+    }
 }

 inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
@@ -35,10 +41,9 @@ void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int
     SwapRowCols(B, Brows, Bcols, transB);
     SwapRowCols(C, Crows, Ccols, transC);

-    CV_DbgAssert(!(flags & GEMM_3_T));
+    CV_Assert(!(flags & GEMM_3_T));
     CV_Assert(Acols == Brows && Arows == Crows && Bcols == Ccols);
     CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
-    CV_Assert(A.type() == CV_32F || A.type() == CV_64F);
     CV_Assert(A.type() == B.type() && B.type() == C.type());
     CV_Assert(A.data != C.data && B.data != C.data);
@@ -59,6 +64,10 @@ void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int
                     B.ptr<double>(), B.cols,
                     beta, C.ptr<double>(), C.cols);
     }
+    else
+    {
+        CV_Error(Error::BadDepth, "Only floating point types are supported");
+    }
 #else
     cv::gemm(A, B, alpha, C, beta, C, flags);
 #endif
@@ -70,7 +79,6 @@ int getBlasThreads()
     return openblas_get_num_threads();
 #else
     return 1;
 #endif
 }
@@ -81,7 +89,6 @@ void setBlasThreads(int numThreads)
     goto_set_num_threads(numThreads);
 #else
     (void)numThreads; //suppress compilers' warning
-    numThreads = 0;
 #endif
 }
...
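For readers unfamiliar with the GEMM flag conventions that gemmCPU now asserts on (GEMM_3_T is rejected, and non-floating-point depths raise BadDepth), here is the flag semantics of cv::gemm in isolation, with made-up matrices:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Mat A = (cv::Mat_<float>(2, 3) << 1, 2, 3, 4, 5, 6);
    cv::Mat B = cv::Mat::eye(2, 2, CV_32F);
    cv::Mat C;

    // C = 1.0 * A^T * B + 0.0 * (no delta term); GEMM_1_T transposes A,
    // GEMM_2_T would transpose B, GEMM_3_T would transpose the delta term.
    cv::gemm(A, B, 1.0, cv::noArray(), 0.0, C, cv::GEMM_1_T);

    std::cout << C << std::endl;  // 3x2 result, equal to A transposed here
    return 0;
}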
@@ -42,6 +42,7 @@
 #if defined(ENABLE_CAFFE_MODEL_TESTS)
 #include "test_precomp.hpp"
 #include "npy_blob.hpp"
+#include <opencv2/core/ocl.hpp>

 namespace cvtest
 {
...