Commit 50c9e1c9 authored by Vitaliy Lyudvichenko

Update of the (De)Convolution layer implementations:

- Add an OpenCL version of col2im()
- Make the DeConvolution forward() implementation polymorphic over Mat/UMat
- Fix an OpenCL gemm problem

parent 1913eb70
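The "polymorphic" forward() in this commit follows the dispatch pattern ConvolutionLayer already uses: a single public entry point that instantiates the algorithm body over cv::Mat (plain CPU) or cv::UMat (OpenCL through the transparent API). A minimal sketch of the idea, with illustrative names rather than the layer's actual members:

```cpp
#include <opencv2/core.hpp>

// Hedged sketch of the Mat/UMat dispatch used by the layers below;
// ExampleLayer and its members are stand-ins, not the commit's code.
struct ExampleLayer
{
    bool useOpenCL = false; // mirrors the layers' useOpenCL flag

    void forward()
    {
        if (!useOpenCL)
            forward_<cv::Mat>();   // CPU path
        else
            forward_<cv::UMat>();  // OpenCL path
    }

    template <typename XMat>
    void forward_()
    {
        // The algorithm body is written once against XMat; operations such
        // as gemm then run on the CPU or on an OpenCL device accordingly.
    }
};
```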
@@ -117,14 +117,12 @@ void ConvolutionLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Bl
     if (!is1x1())
     {
         colBlob.create(Shape(ksize, outH * outW), inpBlob.type(), allocFlags);
-        colMat = colBlob.matRef();
     }
 
     if (bias)
     {
         biasOnesBlob.create(Shape(1, topH * topW), inpBlob.type(), allocFlags);
         biasOnesBlob.matRef().setTo(1);
-        biasOnesMat = biasOnesBlob.matRefConst();
     }
 }
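The `biasOnesBlob` allocated above is a 1 × (topH·topW) row of ones: adding the bias then becomes a rank-1 GEMM update, where biases (outCn × 1) times the ones row (1 × N) broadcasts each channel's bias across all spatial positions and is accumulated with beta = 1. A self-contained sketch of the trick with plain cv::Mat (function and variable names are invented for the example):

```cpp
#include <opencv2/core.hpp>

// dst += bias * ones: bias is (channels x 1), ones is (1 x planeSize),
// so the outer product broadcasts each channel's bias over the whole plane.
void addChannelBias(cv::Mat &dst, const cv::Mat &bias)
{
    cv::Mat ones = cv::Mat::ones(1, dst.cols, dst.type());
    // alpha = 1 scales the outer product; beta = 1 keeps the existing dst
    cv::gemm(bias, ones, 1.0, dst, 1.0, dst);
}
```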
@@ -141,12 +139,11 @@ void ConvolutionLayer::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &o
     for (size_t ii = 0; ii < outputs.size(); ii++)
     {
-        Blob &inpBlob = *inputs[ii];
-        Blob &outBlob = outputs[ii];
-        XMat inpMat = inpBlob.getRefConst<XMat>();
-        XMat outMat = reshaped(outBlob.getRef<XMat>(), Shape(inpBlob.num()*group*outGroupCn, outH*outW));
+        int numImg = inputs[ii]->size(0);
+        XMat inpMat = inputs[ii]->getRefConst<XMat>();
+        XMat outMat = reshaped(outputs[ii].getRef<XMat>(), Shape(numImg*group*outGroupCn, outH*outW));
 
-        for (int n = 0; n < inpBlob.num(); n++)
+        for (int n = 0; n < numImg; n++)
         {
             for (int g = 0; g < group; g++)
             {
@@ -163,7 +160,7 @@ void ConvolutionLayer::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &o
             if (bias)
             {
-                dnn::gemm(biasesMat.rowRange(kerRange), biasOnesMat, 1, dstMat, 1);
+                dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob.getRefConst<XMat>(), 1, dstMat, 1);
             }
         }
     }
@@ -180,16 +177,14 @@ void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &ou
 void ConvolutionLayer::im2col(const UMat &srcImg, UMat &dstCol)
 {
-#ifdef HAVE_OPENCL
-    if (!is1x1())
-    {
-        im2col_ocl(srcImg, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, this->colBlob.umatRef());
-        dstCol = this->colBlob.umatRefConst();
-    }
-    else
+    if (is1x1())
     {
         dstCol = reshaped(srcImg, Shape(ksize, outH*outW));
+        return;
     }
+
+#ifdef HAVE_OPENCL
+    CV_Assert(im2col_ocl(srcImg, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, this->colBlob.umatRef()));
+    dstCol = this->colBlob.umatRefConst();
 #else
     CV_Error(Error::StsInternal, "");
     dstCol = srcImg; //suppress warning
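The early return above works because a 1×1 kernel with unit stride and no padding makes im2col a pure reshape: each "patch" is a single pixel, so the input laid out as channels × (H·W) already is the column matrix. A hedged illustration with plain cv::Mat (the helper name is made up):

```cpp
#include <opencv2/core.hpp>
#include <cassert>

// For a 1x1 kernel, stride 1, no padding: im2col(src) is src viewed as a
// (channels) x (H*W) matrix -- a header change, no data movement.
cv::Mat im2col1x1(const cv::Mat &src, int channels, int H, int W)
{
    assert(src.total() == (size_t)channels * H * W);
    return src.reshape(1, channels); // channels rows, H*W columns
}
```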
@@ -244,47 +239,75 @@ void DeConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
 void DeConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
 {
-    Blob &wghtBlob = blobs[0];
+    if (!useOpenCL)
+        forward_<Mat>(inputs, outputs);
+    else
+        forward_<UMat>(inputs, outputs);
+}
+
+template<typename XMat>
+void DeConvolutionLayer::forward_(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
+{
+    XMat weightsMat = reshaped(blobs[0].getRefConst<XMat>(), Shape(outCn, ksize));
+    XMat biasesMat = reshaped(blobs[1].getRefConst<XMat>(), Shape(outCn, 1));
 
     for (size_t ii = 0; ii < outputs.size(); ii++)
     {
-        Blob &convBlob = *inputs[ii];
-        Blob &decnBlob = outputs[ii];
+        int numImg = inputs[ii]->size(0);
+        XMat convBlob = reshaped(inputs[ii]->getRefConst<XMat>(), Shape(numImg*outCn, outH*outW));
+        XMat decnBlob = reshaped(outputs[ii].getRef<XMat>(), Shape(numImg*inpCn, inpH*inpW));
 
-        for (int n = 0; n < convBlob.num(); n++)
+        for (int n = 0; n < numImg; n++)
         {
             for (int g = 0; g < group; g++)
             {
-                Mat dstMat(inpGroupCn, inpH*inpW, decnBlob.type(), decnBlob.ptr(n, g*inpGroupCn));
-                Mat convMat(outGroupCn, outH*outW, convBlob.type(), convBlob.ptr(n, g*outGroupCn));
-                Mat wghtMat(outGroupCn, ksize, wghtBlob.type(), wghtBlob.ptr(g*outGroupCn));
-                gemmCPU(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
+                XMat dstMat = decnBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
+                XMat &colMat = (is1x1()) ? dstMat : colBlob.getRef<XMat>();
+                if (is1x1())
+                    colMat = dstMat;
+
+                XMat convMat = convBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
+                XMat wghtMat = weightsMat.rowRange(_Range(g * outGroupCn, outGroupCn));
+                dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
 
-                col2im(dstMat);
+                if (!is1x1())
+                    col2im(colMat, dstMat);
 
                 if (bias)
                 {
-                    float *biasPtr = blobs[1].ptrf() + g*inpGroupCn;
-                    Mat biasMat(inpGroupCn, 1, CV_32F, biasPtr);
-                    gemmCPU(biasMat, biasOnesMat, 1, dstMat, 1); //TODO: gemv
+                    XMat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
+                    dnn::gemm(curBiasMat, biasOnesBlob.getRefConst<XMat>(), 1, dstMat, 1);
                 }
             }
         }
     }
 }
 
-void DeConvolutionLayer::col2im(Mat &dstMat)
+void DeConvolutionLayer::col2im(const Mat &colMat, Mat &dstImg)
 {
-    if (is1x1()) return;
+    if (is1x1())
+    {
+        dstImg = colMat;
+        return;
+    }
-    if (dstMat.type() == CV_32F)
-        col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
-    if (dstMat.type() == CV_64F)
-        col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
+    if (dstImg.type() == CV_32F)
+        col2im_CpuPBody<float>::run(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstImg.ptr<float>());
+    if (dstImg.type() == CV_64F)
+        col2im_CpuPBody<double>::run(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstImg.ptr<double>());
 }
+
+void DeConvolutionLayer::col2im(const UMat &colMat, UMat &dstImg)
+{
+    if (is1x1())
+    {
+        dstImg = colMat;
+        return;
+    }
+#ifdef HAVE_OPENCL
+    CV_Assert(col2im_ocl(colMat, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstImg));
+#else
+    CV_Error(Error::StsInternal, "");
+    dstImg = colMat;
+#endif
+}
 }
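A note on naming in the hunk above: DeConvolutionLayer inherits its fields from ConvolutionLayer, so `outH`/`outW`/`outCn` describe the small map the layer *receives* and `inpH`/`inpW`/`inpCn` the large map it *produces*. The gemm with `GEMM_1_T` turns each (outGroupCn × outH·outW) input slice into a (ksize × outH·outW) column matrix, and col2im scatters the overlapping patches into the (inpGroupCn × inpH·inpW) destination. A hedged, standalone check of that shape arithmetic (all values invented for illustration):

```cpp
#include <cassert>

// Shape walk-through of forward_ above, for one image and one group.
int main()
{
    const int outGroupCn = 8;     // channels of the small map the layer receives
    const int inpGroupCn = 3;     // channels of the large map it produces
    const int kerH = 4, kerW = 4;
    const int strideH = 2, strideW = 2, padH = 1, padW = 1;
    const int outH = 16, outW = 16;

    // the column matrix produced by gemm(W^T, conv) is (ksize x outH*outW)
    const int ksize = inpGroupCn * kerH * kerW;

    // col2im target size -- the inverse of the usual im2col size formula
    const int inpH = strideH * (outH - 1) + kerH - 2 * padH;
    const int inpW = strideW * (outW - 1) + kerW - 2 * padW;

    assert(ksize == 48);
    assert(inpH == 32 && inpW == 32);
    // forward check: im2col of the produced map would give outH back
    assert((inpH + 2 * padH - kerH) / strideH + 1 == outH);
    return 0;
}
```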
......
@@ -66,7 +66,6 @@
     bool tryUseOpenCL, useOpenCL;
     Blob colBlob, biasOnesBlob;
-    Mat colMat, biasOnesMat;
 
     inline bool is1x1() const;
     virtual void computeInpOutShape(const Blob &inpBlob);
@@ -88,11 +87,15 @@
 {
 protected:
     void computeInpOutShape(const Blob &inpBlob);
-    void col2im(Mat &dstMat);
+    void col2im(const Mat &colMat, Mat &dstImg);
+    void col2im(const UMat &colMat, UMat &dstImg);
 
 public:
     DeConvolutionLayer(LayerParams &params);
     void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+
+    template<typename XMat>
+    void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
 };
 }
 }
......
@@ -16,7 +16,9 @@ void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double b
     if (C.isMat())
         gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
     else
-        cv::gemm(A, B, alpha, C, beta, C, flags);
+    {
+        cv::gemm(A, B, alpha, (beta == 0) ? noArray() : C, beta, C, flags);
+    }
 }
 
 inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
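This is the "OpenCL gemm problem" from the commit message: cv::gemm reads its src3 argument whenever one is supplied, so passing a freshly allocated, still-uninitialized C as src3 with beta == 0 feeds garbage into the accumulation. Passing noArray() skips the read entirely. A self-contained sketch of the same guard (the function name is invented for the example):

```cpp
#include <opencv2/core.hpp>

// C = alpha*A*B + beta*C, but never read C when beta is zero.
void safeGemm(const cv::Mat &A, const cv::Mat &B, double alpha,
              cv::Mat &C, double beta)
{
    if (beta == 0)
        cv::gemm(A, B, alpha, cv::noArray(), 0, C); // C is write-only here
    else
        cv::gemm(A, B, alpha, C, beta, C);          // accumulate into C
}
```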
......
@@ -40,6 +40,7 @@
 //M*/
 
 #include "../precomp.hpp"
 #include <opencv2/core/ocl.hpp>
+#include "opencl_kernels_dnn.hpp"
 #include "op_im2col.hpp"
@@ -49,36 +50,73 @@ namespace dnn
 {
 #ifdef HAVE_OPENCL
-void im2col_ocl(const UMat &img,
+bool im2col_ocl(const UMat &img,
                 int channels, int height, int width,
                 int kernel_h, int kernel_w,
                 int pad_h, int pad_w,
                 int stride_h, int stride_w,
                 UMat &col)
 {
-    int h_out = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int w_out = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+    int esz = img.elemSize();
 
     CV_Assert(img.isContinuous() && col.isContinuous());
     CV_Assert(img.total() == (size_t)channels * height * width);
-    CV_Assert(col.total() == (size_t)channels * kernel_h * kernel_w * h_out * w_out);
+    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);
 
-    ocl::Kernel im2col_ker("im2col", ocl::dnn::im2col_oclsrc);
-    CV_Assert(!im2col_ker.empty());
+    ocl::Kernel ker("im2col", ocl::dnn::im2col_oclsrc, String("-DT=") + ocl::typeToStr(img.type()));
+    if (ker.empty())
+        return false;
 
-    im2col_ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset,
-                    channels, height, width,
-                    kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
-                    h_out, w_out,
-                    ocl::KernelArg::PtrWriteOnly(col), (int)col.offset
-                    );
+    ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset/esz,
+             channels, height, width,
+             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+             height_col, width_col,
+             ocl::KernelArg::PtrWriteOnly(col), (int)col.offset/esz
+             );
 
     size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
-    size_t globalSize = (size_t)channels * h_out * w_out;
-    CV_Assert(im2col_ker.run(1, &globalSize, &localSize, true));
+    size_t globalSize = (size_t)channels * height_col * width_col;
+    return ker.run(1, &globalSize, &localSize, true);
+}
+
+bool col2im_ocl(const UMat &col,
+                int channels, int height, int width,
+                int kernel_h, int kernel_w,
+                int pad_h, int pad_w,
+                int stride_h, int stride_w,
+                UMat &img)
+{
+    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+    int esz = img.elemSize();
+
+    CV_Assert(img.isContinuous() && col.isContinuous());
+    CV_Assert(img.total() == (size_t)channels * height * width);
+    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);
+
+    ocl::Kernel ker("col2im", ocl::dnn::col2im_oclsrc, String("-DT=") + ocl::typeToStr(col.type()));
+    if (ker.empty())
+        return false;
+
+    ker.args((int)img.total(),
+             ocl::KernelArg::PtrReadOnly(col), (int)col.offset/esz,
+             height, width, channels,
+             kernel_h, kernel_w,
+             pad_h, pad_w,
+             stride_h, stride_w,
+             height_col, width_col,
+             ocl::KernelArg::PtrWriteOnly(img), (int)img.offset/esz);
+
+    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
+    size_t globalSize = img.total();
+    return ker.run(1, &globalSize, &localSize, true);
 }
 #endif
 }
 }
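Both helpers now build their kernel with a `-DT=<type>` option: the OpenCL source is written against a macro `T`, and `ocl::typeToStr()` maps the cv type to the matching OpenCL scalar name, so one source serves float and double. A hedged sketch of the pattern in isolation (kernel source and function names are invented for the example):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>

// The same source builds for different element types by defining T on the
// build command line, exactly like the -DT= option used above.
static const char *scaleSrc =
    "__kernel void scale(__global T *buf, T factor)\n"
    "{\n"
    "    int i = get_global_id(0);\n"
    "    buf[i] *= factor;\n"
    "}\n";

bool scaleBuffer(cv::UMat &buf, float factor)
{
    // keep T == float in this sketch; double would also need cl_khr_fp64
    CV_Assert(buf.type() == CV_32F && buf.isContinuous());

    cv::ocl::Kernel ker("scale", cv::ocl::ProgramSource(scaleSrc),
                        cv::String("-DT=") + cv::ocl::typeToStr(buf.type()));
    if (ker.empty())
        return false; // no OpenCL device or build failure: caller falls back to CPU

    ker.args(cv::ocl::KernelArg::PtrReadWrite(buf), factor);
    size_t globalSize = buf.total();
    return ker.run(1, &globalSize, NULL, true);
}
```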
@@ -41,7 +41,7 @@
 #ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
 #define __OPENCV_DNN_LAYERS_IM2COL_HPP__
-#include <opencv2/core.hpp>
+#include "../precomp.hpp"
 #include <iostream>
 
 namespace cv
@@ -60,26 +60,9 @@ class im2col_CpuPBody : public cv::ParallelLoopBody
     Dtype* data_col;
     int height_col, width_col, channels_col;
 
+    im2col_CpuPBody() {}
+
 public:
-    im2col_CpuPBody(const Dtype* data_im_,
-                    int channels_, int height_, int width_,
-                    int kernel_h_, int kernel_w_,
-                    int pad_h_, int pad_w_,
-                    int stride_h_, int stride_w_,
-                    Dtype* data_col_) :
-        data_im(data_im_),
-        channels(channels_), height(height_), width(width_),
-        kernel_h(kernel_h_), kernel_w(kernel_w_),
-        pad_h(pad_h_), pad_w(pad_w_),
-        stride_h(stride_h_), stride_w(stride_w_),
-        data_col(data_col_)
-    {
-        height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-        width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-        channels_col = channels * kernel_h * kernel_w;
-    }
-
     static void run(const Dtype* data_im,
                     int channels, int height, int width,
                     int kernel_h, int kernel_w,
@@ -87,8 +70,18 @@ public:
                     int stride_h, int stride_w,
                     Dtype* data_col)
     {
-        im2col_CpuPBody<Dtype> pb(data_im, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_col);
-        cv::parallel_for_(Range(0, pb.channels_col), pb);
+        im2col_CpuPBody<Dtype> t;
+        t.data_im = data_im;
+        t.data_col = data_col;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+        t.channels_col = channels * kernel_h * kernel_w;
+
+        cv::parallel_for_(Range(0, t.channels_col), t);
     }
virtual void operator ()(const Range &r) const
@@ -112,25 +105,94 @@ public:
         }
     }
 };
 
+template <typename Dtype>
+class col2im_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_col;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    Dtype* data_im;
+    int height_col, width_col;
+
+    col2im_CpuPBody() {}
+
+public:
+    static void run(const Dtype* data_col,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    Dtype* data_im)
+    {
+        //TODO: single-threaded version switch
+        col2im_CpuPBody t;
+        t.data_col = data_col;
+        t.data_im = data_im;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+
+        int img_total = channels * height * width;
+        cv::parallel_for_(Range(0, img_total), t);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        for (int index = r.start; index < r.end; index++)
+        {
+            Dtype val = 0;
+            int w = index % width + pad_w;
+            int h = (index / width) % height + pad_h;
+            int c = index / (width * height);
+
+            // compute the start and end of the output
+            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            int w_col_end = std::min(w / stride_w + 1, width_col);
+            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            int h_col_end = std::min(h / stride_h + 1, height_col);
+
+            // equivalent implementation
+            int offset =
+                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
+            int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
+            int coeff_w_col = (1 - stride_w * height_col * width_col);
+            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+                    val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+                }
+            }
+            data_im[index] = val;
+        }
+    }
+};
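Why the `offset`/`coeff_h_col`/`coeff_w_col` constants work: for an output pixel at channel $c$ and padded coordinates $(h, w)$, the contribution gathered from column position $(h_{col}, w_{col})$ comes from kernel cell $(k_h, k_w) = (h - h_{col} s_h,\; w - w_{col} s_w)$. Its flat index in the column matrix expands as follows (with $H_c, W_c$ the column-grid height and width):

```latex
\begin{aligned}
\mathrm{idx}
 &= \bigl( c\,k_H k_W + (h - h_{col} s_h)\,k_W + (w - w_{col} s_w) \bigr) H_c W_c
    + h_{col} W_c + w_{col} \\
 &= \underbrace{(c\,k_H k_W + h\,k_W + w)\,H_c W_c}_{\texttt{offset}}
  + h_{col}\,\underbrace{(1 - s_h k_W H_c)\,W_c}_{\texttt{coeff\_h\_col}}
  + w_{col}\,\underbrace{(1 - s_w H_c W_c)}_{\texttt{coeff\_w\_col}}
\end{aligned}
```

so the inner double loop needs only two multiply-adds per element instead of recomputing the full three-dimensional index.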
+//single-threaded version
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col,
                 int channels, int height, int width,
-                int patch_h, int patch_w,
+                int kernel_h, int kernel_w,
                 int pad_h, int pad_w,
                 int stride_h, int stride_w,
                 Dtype* data_im)
 {
-    memset(data_im, 0, height * width * channels * sizeof(Dtype));
-
-    int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-    int channels_col = channels * patch_h * patch_w;
+    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+
+    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
 
     for (int c = 0; c < channels_col; ++c)
     {
-        int w_offset = c % patch_w;
-        int h_offset = (c / patch_w) % patch_h;
-        int c_im = c / patch_h / patch_w;
+        int w_offset = c % kernel_w;
+        int h_offset = (c / kernel_w) % kernel_h;
+        int c_im = c / kernel_h / kernel_w;
         for (int h = 0; h < height_col; ++h)
         {
@@ -148,12 +210,19 @@ void col2im_cpu(const Dtype* data_col,
 }
 
 #ifdef HAVE_OPENCL
-void im2col_ocl(const UMat &img,
+bool im2col_ocl(const UMat &img,
                 int channels, int height, int width,
                 int kernel_h, int kernel_w,
                 int pad_h, int pad_w,
                 int stride_h, int stride_w,
                 UMat &col);
+
+bool col2im_ocl(const UMat &col,
+                int channels, int height, int width,
+                int kernel_h, int kernel_w,
+                int pad_h, int pad_w,
+                int stride_h, int stride_w,
+                UMat &img);
 #endif
 }
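The two parallel bodies are adjoints, not inverses: im2col copies every input pixel into each patch that covers it, and col2im sums those copies back. A hedged usage sketch against the declarations above (assumes the classes live in cv::dnn and that the header can be included standalone):

```cpp
#include <vector>
#include "op_im2col.hpp" // assumed include path

// im2col followed by col2im multiplies each pixel by its coverage count:
// with a 3x3 kernel, stride 1, pad 0 on a 5x5 image, the center pixel is
// covered by all 9 patches while each corner is covered by exactly one.
void roundTripDemo()
{
    const int C = 1, H = 5, W = 5, K = 3, pad = 0, stride = 1;
    const int Hc = (H + 2*pad - K) / stride + 1;   // 3
    const int Wc = (W + 2*pad - K) / stride + 1;   // 3

    std::vector<float> img(C*H*W, 1.0f);
    std::vector<float> col(C*K*K*Hc*Wc, 0.0f);
    std::vector<float> back(C*H*W, 0.0f);

    cv::dnn::im2col_CpuPBody<float>::run(img.data(), C, H, W, K, K, pad, pad,
                                         stride, stride, col.data());
    cv::dnn::col2im_CpuPBody<float>::run(col.data(), C, H, W, K, K, pad, pad,
                                         stride, stride, back.data());
    // back[12] (the center) == 9.0f, back[0] (a corner) == 1.0f
}
```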
......
/*************************************************************************************
* Copyright (c) 2015, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
__kernel void col2im(const int n, __global const T* data_col, const int col_offset,
const int height, const int width, const int channels,
const int patch_h, const int patch_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
__global T* data_im, const int img_offset)
{
data_col = data_col + col_offset;
data_im = data_im + img_offset;
int index = get_global_id(0);
if(index < n) {
T val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im[index] = val;
}
}
@@ -39,11 +39,11 @@
 //
 //M*/
 
-__kernel void im2col(__global const float *im_src, int im_src_offset,
+__kernel void im2col(__global const T *im_src, int im_src_offset,
                      int channels, int height_inp, int width_inp,
                      int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,
                      int height_out, int width_out,
-                     __global float *im_col, int im_col_offset
+                     __global T *im_col, int im_col_offset
                      )
 {
     int index = get_global_id(0);
@@ -52,13 +52,13 @@ __kernel void im2col(__global const float *im_src, int im_src_offset,
     int j_out = index % width_out;
     int i_out = (index / width_out) % height_out;
     int c_inp = (index / width_out) / height_out;
 
     int c_out = c_inp * kernel_h * kernel_w;
     int i_inp = i_out * stride_h - pad_h;
     int j_inp = j_out * stride_w - pad_w;
 
-    im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset / sizeof(float);
-    im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset / sizeof(float);
+    im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;
+    im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;
 
     for (int ki = 0; ki < kernel_h; ++ki)
         for (int kj = 0; kj < kernel_w; ++kj) {
......
@@ -107,7 +107,7 @@ TEST(Layer_Test_LRN_channels, Accuracy)
 TEST(Layer_Test_Convolution, Accuracy)
 {
-    testLayerUsingCaffeModels("layer_convolution", true);
+    OCL_OFF(testLayerUsingCaffeModels("layer_convolution", true));
 }
 OCL_TEST(Layer_Test_Convolution, Accuracy)
 {
@@ -117,7 +117,7 @@ OCL_TEST(Layer_Test_Convolution, Accuracy)
 TEST(Layer_Test_DeConvolution, Accuracy)
 {
-    testLayerUsingCaffeModels("layer_deconvolution", true, false);
+    OCL_OFF(testLayerUsingCaffeModels("layer_deconvolution", true, false));
 }
 OCL_TEST(Layer_Test_DeConvolution, Accuracy)
 {
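For reference, OCL_OFF and OCL_TEST/OCL_ON come from OpenCV's test framework: the plain TEST now pins the accuracy check to the CPU path, while the OCL_TEST variant runs the same model with the transparent OpenCL API enabled. A hedged sketch of the convention (the model name is hypothetical):

```cpp
TEST(Layer_Test_Example, Accuracy)
{
    OCL_OFF(testLayerUsingCaffeModels("layer_example", true));
}
OCL_TEST(Layer_Test_Example, Accuracy)
{
    OCL_ON(testLayerUsingCaffeModels("layer_example", true));
}
```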
......