Commit 50c9e1c9 authored by Vitaliy Lyudvichenko

Update of the (De)Convolution layer implementations:

- Add an OpenCL version of col2im()
- Make the DeConvolution forward() implementation polymorphic over Mat/UMat
- Fix an OpenCL gemm problem

parent 1913eb70
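The "polymorphic" forward() in this commit follows the dispatch pattern ConvolutionLayer already uses: a single public entry point that instantiates the algorithm body over cv::Mat (plain CPU) or cv::UMat (OpenCL through the transparent API). A minimal sketch of the idea, with illustrative names rather than the layer's actual members:

```cpp
#include <opencv2/core.hpp>

// Hedged sketch of the Mat/UMat dispatch used by the layers below;
// ExampleLayer and its members are stand-ins, not the commit's code.
struct ExampleLayer
{
    bool useOpenCL = false; // mirrors the layers' useOpenCL flag

    void forward()
    {
        if (!useOpenCL)
            forward_<cv::Mat>();   // CPU path
        else
            forward_<cv::UMat>();  // OpenCL path
    }

    template <typename XMat>
    void forward_()
    {
        // The algorithm body is written once against XMat; operations such
        // as gemm then run on the CPU or on an OpenCL device accordingly.
    }
};
```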
@@ -117,14 +117,12 @@ void ConvolutionLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Bl
     if (!is1x1())
     {
         colBlob.create(Shape(ksize, outH * outW), inpBlob.type(), allocFlags);
-        colMat = colBlob.matRef();
     }
 
     if (bias)
     {
         biasOnesBlob.create(Shape(1, topH * topW), inpBlob.type(), allocFlags);
         biasOnesBlob.matRef().setTo(1);
-        biasOnesMat = biasOnesBlob.matRefConst();
     }
 }
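The `biasOnesBlob` allocated above is a 1 × (topH·topW) row of ones: adding the bias then becomes a rank-1 GEMM update, where biases (outCn × 1) times the ones row (1 × N) broadcasts each channel's bias across all spatial positions and is accumulated with beta = 1. A self-contained sketch of the trick with plain cv::Mat (function and variable names are invented for the example):

```cpp
#include <opencv2/core.hpp>

// dst += bias * ones: bias is (channels x 1), ones is (1 x planeSize),
// so the outer product broadcasts each channel's bias over the whole plane.
void addChannelBias(cv::Mat &dst, const cv::Mat &bias)
{
    cv::Mat ones = cv::Mat::ones(1, dst.cols, dst.type());
    // alpha = 1 scales the outer product; beta = 1 keeps the existing dst
    cv::gemm(bias, ones, 1.0, dst, 1.0, dst);
}
```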
@@ -141,12 +139,11 @@ void ConvolutionLayer::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &o
     for (size_t ii = 0; ii < outputs.size(); ii++)
     {
-        Blob &inpBlob = *inputs[ii];
-        Blob &outBlob = outputs[ii];
-        XMat inpMat = inpBlob.getRefConst<XMat>();
-        XMat outMat = reshaped(outBlob.getRef<XMat>(), Shape(inpBlob.num()*group*outGroupCn, outH*outW));
+        int numImg = inputs[ii]->size(0);
+        XMat inpMat = inputs[ii]->getRefConst<XMat>();
+        XMat outMat = reshaped(outputs[ii].getRef<XMat>(), Shape(numImg*group*outGroupCn, outH*outW));
 
-        for (int n = 0; n < inpBlob.num(); n++)
+        for (int n = 0; n < numImg; n++)
         {
             for (int g = 0; g < group; g++)
             {
@@ -163,7 +160,7 @@ void ConvolutionLayer::forward_(std::vector<Blob*> &inputs, std::vector<Blob> &o
             if (bias)
             {
-                dnn::gemm(biasesMat.rowRange(kerRange), biasOnesMat, 1, dstMat, 1);
+                dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob.getRefConst<XMat>(), 1, dstMat, 1);
             }
         }
     }
@@ -180,16 +177,14 @@ void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &ou
 void ConvolutionLayer::im2col(const UMat &srcImg, UMat &dstCol)
 {
-#ifdef HAVE_OPENCL
-    if (!is1x1())
-    {
-        im2col_ocl(srcImg, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, this->colBlob.umatRef());
-        dstCol = this->colBlob.umatRefConst();
-    }
-    else
+    if (is1x1())
     {
         dstCol = reshaped(srcImg, Shape(ksize, outH*outW));
+        return;
     }
+
+#ifdef HAVE_OPENCL
+    CV_Assert(im2col_ocl(srcImg, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, this->colBlob.umatRef()));
+    dstCol = this->colBlob.umatRefConst();
 #else
     CV_Error(Error::StsInternal, "");
     dstCol = srcImg; //suppress warning
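The early return above works because a 1×1 kernel with unit stride and no padding makes im2col a pure reshape: each "patch" is a single pixel, so the input laid out as channels × (H·W) already is the column matrix. A hedged illustration with plain cv::Mat (the helper name is made up):

```cpp
#include <opencv2/core.hpp>
#include <cassert>

// For a 1x1 kernel, stride 1, no padding: im2col(src) is src viewed as a
// (channels) x (H*W) matrix -- a header change, no data movement.
cv::Mat im2col1x1(const cv::Mat &src, int channels, int H, int W)
{
    assert(src.total() == (size_t)channels * H * W);
    return src.reshape(1, channels); // channels rows, H*W columns
}
```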
@@ -244,47 +239,75 @@ void DeConvolutionLayer::computeInpOutShape(const Blob &inpBlob)
 void DeConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
 {
-    Blob &wghtBlob = blobs[0];
+    if (!useOpenCL)
+        forward_<Mat>(inputs, outputs);
+    else
+        forward_<UMat>(inputs, outputs);
+}
+
+template<typename XMat>
+void DeConvolutionLayer::forward_(std::vector<Blob *> &inputs, std::vector<Blob> &outputs)
+{
+    XMat weightsMat = reshaped(blobs[0].getRefConst<XMat>(), Shape(outCn, ksize));
+    XMat biasesMat = reshaped(blobs[1].getRefConst<XMat>(), Shape(outCn, 1));
 
     for (size_t ii = 0; ii < outputs.size(); ii++)
     {
-        Blob &convBlob = *inputs[ii];
-        Blob &decnBlob = outputs[ii];
+        int numImg = inputs[ii]->size(0);
+        XMat convBlob = reshaped(inputs[ii]->getRefConst<XMat>(), Shape(numImg*outCn, outH*outW));
+        XMat decnBlob = reshaped(outputs[ii].getRef<XMat>(), Shape(numImg*inpCn, inpH*inpW));
 
-        for (int n = 0; n < convBlob.num(); n++)
+        for (int n = 0; n < numImg; n++)
         {
             for (int g = 0; g < group; g++)
             {
-                Mat dstMat(inpGroupCn, inpH*inpW, decnBlob.type(), decnBlob.ptr(n, g*inpGroupCn));
-                Mat convMat(outGroupCn, outH*outW, convBlob.type(), convBlob.ptr(n, g*outGroupCn));
-                Mat wghtMat(outGroupCn, ksize, wghtBlob.type(), wghtBlob.ptr(g*outGroupCn));
-                gemmCPU(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
+                XMat dstMat = decnBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
+                XMat &colMat = (is1x1()) ? dstMat : colBlob.getRef<XMat>();
+                if (is1x1())
+                    colMat = dstMat;
+
+                XMat convMat = convBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
+                XMat wghtMat = weightsMat.rowRange(_Range(g * outGroupCn, outGroupCn));
+                dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);
 
-                col2im(dstMat);
+                if (!is1x1())
+                    col2im(colMat, dstMat);
 
                 if (bias)
                 {
-                    float *biasPtr = blobs[1].ptrf() + g*inpGroupCn;
-                    Mat biasMat(inpGroupCn, 1, CV_32F, biasPtr);
-                    gemmCPU(biasMat, biasOnesMat, 1, dstMat, 1); //TODO: gemv
+                    XMat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
+                    dnn::gemm(curBiasMat, biasOnesBlob.getRefConst<XMat>(), 1, dstMat, 1);
                 }
             }
         }
     }
 }
 
-void DeConvolutionLayer::col2im(Mat &dstMat)
+void DeConvolutionLayer::col2im(const Mat &colMat, Mat &dstImg)
 {
-    if (is1x1()) return;
+    if (is1x1())
+    {
+        dstImg = colMat;
+        return;
+    }
-    if (dstMat.type() == CV_32F)
-        col2im_cpu(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<float>());
-    if (dstMat.type() == CV_64F)
-        col2im_cpu(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstMat.ptr<double>());
+    if (dstImg.type() == CV_32F)
+        col2im_CpuPBody<float>::run(colMat.ptr<float>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstImg.ptr<float>());
+    if (dstImg.type() == CV_64F)
+        col2im_CpuPBody<double>::run(colMat.ptr<double>(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstImg.ptr<double>());
 }
+
+void DeConvolutionLayer::col2im(const UMat &colMat, UMat &dstImg)
+{
+    if (is1x1())
+    {
+        dstImg = colMat;
+        return;
+    }
+#ifdef HAVE_OPENCL
+    CV_Assert(col2im_ocl(colMat, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dstImg));
+#else
+    CV_Error(Error::StsInternal, "");
+    dstImg = colMat;
+#endif
+}
 }
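A note on naming in the hunk above: DeConvolutionLayer inherits its fields from ConvolutionLayer, so `outH`/`outW`/`outCn` describe the small map the layer *receives* and `inpH`/`inpW`/`inpCn` the large map it *produces*. The gemm with `GEMM_1_T` turns each (outGroupCn × outH·outW) input slice into a (ksize × outH·outW) column matrix, and col2im scatters the overlapping patches into the (inpGroupCn × inpH·inpW) destination. A hedged, standalone check of that shape arithmetic (all values invented for illustration):

```cpp
#include <cassert>

// Shape walk-through of forward_ above, for one image and one group.
int main()
{
    const int outGroupCn = 8;     // channels of the small map the layer receives
    const int inpGroupCn = 3;     // channels of the large map it produces
    const int kerH = 4, kerW = 4;
    const int strideH = 2, strideW = 2, padH = 1, padW = 1;
    const int outH = 16, outW = 16;

    // the column matrix produced by gemm(W^T, conv) is (ksize x outH*outW)
    const int ksize = inpGroupCn * kerH * kerW;

    // col2im target size -- the inverse of the usual im2col size formula
    const int inpH = strideH * (outH - 1) + kerH - 2 * padH;
    const int inpW = strideW * (outW - 1) + kerW - 2 * padW;

    assert(ksize == 48);
    assert(inpH == 32 && inpW == 32);
    // forward check: im2col of the produced map would give outH back
    assert((inpH + 2 * padH - kerH) / strideH + 1 == outH);
    return 0;
}
```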
......
@@ -66,7 +66,6 @@
     bool tryUseOpenCL, useOpenCL;
     Blob colBlob, biasOnesBlob;
-    Mat colMat, biasOnesMat;
 
     inline bool is1x1() const;
     virtual void computeInpOutShape(const Blob &inpBlob);
@@ -88,11 +87,15 @@
 {
 protected:
     void computeInpOutShape(const Blob &inpBlob);
-    void col2im(Mat &dstMat);
+    void col2im(const Mat &colMat, Mat &dstImg);
+    void col2im(const UMat &colMat, UMat &dstImg);
 
 public:
     DeConvolutionLayer(LayerParams &params);
     void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+
+    template<typename XMat>
+    void forward_(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
 };
 }
 }
......
@@ -16,7 +16,9 @@ void gemm(InputArray A, InputArray B, double alpha, InputOutputArray C, double b
     if (C.isMat())
         gemmCPU(A.getMat(), B.getMat(), alpha, C.getMatRef(), beta, flags);
     else
-        cv::gemm(A, B, alpha, C, beta, C, flags);
+    {
+        cv::gemm(A, B, alpha, (beta == 0) ? noArray() : C, beta, C, flags);
+    }
 }
 
 inline void SwapRowCols(const Mat &A, int &rows, int &cols, bool isTrans)
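This is the "OpenCL gemm problem" from the commit message: cv::gemm reads its src3 argument whenever one is supplied, so passing a freshly allocated, still-uninitialized C as src3 with beta == 0 feeds garbage into the accumulation. Passing noArray() skips the read entirely. A self-contained sketch of the same guard (the function name is invented for the example):

```cpp
#include <opencv2/core.hpp>

// C = alpha*A*B + beta*C, but never read C when beta is zero.
void safeGemm(const cv::Mat &A, const cv::Mat &B, double alpha,
              cv::Mat &C, double beta)
{
    if (beta == 0)
        cv::gemm(A, B, alpha, cv::noArray(), 0, C); // C is write-only here
    else
        cv::gemm(A, B, alpha, C, beta, C);          // accumulate into C
}
```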
......
@@ -40,6 +40,7 @@
 //M*/
 
 #include "../precomp.hpp"
 #include <opencv2/core/ocl.hpp>
+#include "opencl_kernels_dnn.hpp"
 #include "op_im2col.hpp"
@@ -49,36 +50,73 @@ namespace dnn
 {
 #ifdef HAVE_OPENCL
-void im2col_ocl(const UMat &img,
+bool im2col_ocl(const UMat &img,
                 int channels, int height, int width,
                 int kernel_h, int kernel_w,
                 int pad_h, int pad_w,
                 int stride_h, int stride_w,
                 UMat &col)
 {
-    int h_out = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int w_out = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+    int esz = img.elemSize();
 
     CV_Assert(img.isContinuous() && col.isContinuous());
     CV_Assert(img.total() == (size_t)channels * height * width);
-    CV_Assert(col.total() == (size_t)channels * kernel_h * kernel_w * h_out * w_out);
+    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);
 
-    ocl::Kernel im2col_ker("im2col", ocl::dnn::im2col_oclsrc);
-    CV_Assert(!im2col_ker.empty());
+    ocl::Kernel ker("im2col", ocl::dnn::im2col_oclsrc, String("-DT=") + ocl::typeToStr(img.type()));
+    if (ker.empty())
+        return false;
 
-    im2col_ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset,
-                    channels, height, width,
-                    kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
-                    h_out, w_out,
-                    ocl::KernelArg::PtrWriteOnly(col), (int)col.offset
-                    );
+    ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset/esz,
+             channels, height, width,
+             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+             height_col, width_col,
+             ocl::KernelArg::PtrWriteOnly(col), (int)col.offset/esz
+             );
 
     size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
-    size_t globalSize = (size_t)channels * h_out * w_out;
-    CV_Assert(im2col_ker.run(1, &globalSize, &localSize, true));
+    size_t globalSize = (size_t)channels * height_col * width_col;
+    return ker.run(1, &globalSize, &localSize, true);
+}
+
+bool col2im_ocl(const UMat &col,
+                int channels, int height, int width,
+                int kernel_h, int kernel_w,
+                int pad_h, int pad_w,
+                int stride_h, int stride_w,
+                UMat &img)
+{
+    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+    int esz = img.elemSize();
+
+    CV_Assert(img.isContinuous() && col.isContinuous());
+    CV_Assert(img.total() == (size_t)channels * height * width);
+    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);
+
+    ocl::Kernel ker("col2im", ocl::dnn::col2im_oclsrc, String("-DT=") + ocl::typeToStr(col.type()));
+    if (ker.empty())
+        return false;
+
+    ker.args((int)img.total(),
+             ocl::KernelArg::PtrReadOnly(col), (int)col.offset/esz,
+             height, width, channels,
+             kernel_h, kernel_w,
+             pad_h, pad_w,
+             stride_h, stride_w,
+             height_col, width_col,
+             ocl::KernelArg::PtrWriteOnly(img), (int)img.offset/esz);
+
+    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
+    size_t globalSize = img.total();
+    return ker.run(1, &globalSize, &localSize, true);
 }
 #endif
 }
 }
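Both helpers now build their kernel with a `-DT=<type>` option: the OpenCL source is written against a macro `T`, and `ocl::typeToStr()` maps the cv type to the matching OpenCL scalar name, so one source serves float and double. A hedged sketch of the pattern in isolation (kernel source and function names are invented for the example):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>

// The same source builds for different element types by defining T on the
// build command line, exactly like the -DT= option used above.
static const char *scaleSrc =
    "__kernel void scale(__global T *buf, T factor)\n"
    "{\n"
    "    int i = get_global_id(0);\n"
    "    buf[i] *= factor;\n"
    "}\n";

bool scaleBuffer(cv::UMat &buf, float factor)
{
    // keep T == float in this sketch; double would also need cl_khr_fp64
    CV_Assert(buf.type() == CV_32F && buf.isContinuous());

    cv::ocl::Kernel ker("scale", cv::ocl::ProgramSource(scaleSrc),
                        cv::String("-DT=") + cv::ocl::typeToStr(buf.type()));
    if (ker.empty())
        return false; // no OpenCL device or build failure: caller falls back to CPU

    ker.args(cv::ocl::KernelArg::PtrReadWrite(buf), factor);
    size_t globalSize = buf.total();
    return ker.run(1, &globalSize, NULL, true);
}
```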
@@ -41,7 +41,7 @@
 #ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
 #define __OPENCV_DNN_LAYERS_IM2COL_HPP__
-#include <opencv2/core.hpp>
+#include "../precomp.hpp"
 #include <iostream>
 
 namespace cv
@@ -60,26 +60,9 @@ class im2col_CpuPBody : public cv::ParallelLoopBody
     Dtype* data_col;
     int height_col, width_col, channels_col;
 
+    im2col_CpuPBody() {}
+
 public:
-    im2col_CpuPBody(const Dtype* data_im_,
-                    int channels_, int height_, int width_,
-                    int kernel_h_, int kernel_w_,
-                    int pad_h_, int pad_w_,
-                    int stride_h_, int stride_w_,
-                    Dtype* data_col_) :
-        data_im(data_im_),
-        channels(channels_), height(height_), width(width_),
-        kernel_h(kernel_h_), kernel_w(kernel_w_),
-        pad_h(pad_h_), pad_w(pad_w_),
-        stride_h(stride_h_), stride_w(stride_w_),
-        data_col(data_col_)
-    {
-        height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-        width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-        channels_col = channels * kernel_h * kernel_w;
-    }
-
     static void run(const Dtype* data_im,
                     int channels, int height, int width,
                     int kernel_h, int kernel_w,
@@ -87,8 +70,18 @@ public:
                     int stride_h, int stride_w,
                     Dtype* data_col)
     {
-        im2col_CpuPBody<Dtype> pb(data_im, channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, data_col);
-        cv::parallel_for_(Range(0, pb.channels_col), pb);
+        im2col_CpuPBody<Dtype> t;
+        t.data_im = data_im;
+        t.data_col = data_col;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+        t.channels_col = channels * kernel_h * kernel_w;
+
+        cv::parallel_for_(Range(0, t.channels_col), t);
     }
virtual void operator ()(const Range &r) const
@@ -112,25 +105,94 @@ public:
         }
     }
 };
 
+template <typename Dtype>
+class col2im_CpuPBody : public cv::ParallelLoopBody
+{
+    const Dtype* data_col;
+    int channels, height, width;
+    int kernel_h, kernel_w;
+    int pad_h, pad_w;
+    int stride_h, stride_w;
+    Dtype* data_im;
+    int height_col, width_col;
+
+    col2im_CpuPBody() {}
+
+public:
+    static void run(const Dtype* data_col,
+                    int channels, int height, int width,
+                    int kernel_h, int kernel_w,
+                    int pad_h, int pad_w,
+                    int stride_h, int stride_w,
+                    Dtype* data_im)
+    {
+        //TODO: single-threaded version switch
+        col2im_CpuPBody t;
+        t.data_col = data_col;
+        t.data_im = data_im;
+        t.channels = channels; t.height = height; t.width = width;
+        t.kernel_h = kernel_h; t.kernel_w = kernel_w;
+        t.pad_h = pad_h; t.pad_w = pad_w;
+        t.stride_h = stride_h; t.stride_w = stride_w;
+        t.height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+        t.width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+
+        int img_total = channels * height * width;
+        cv::parallel_for_(Range(0, img_total), t);
+    }
+
+    virtual void operator ()(const Range &r) const
+    {
+        for (int index = r.start; index < r.end; index++)
+        {
+            Dtype val = 0;
+            int w = index % width + pad_w;
+            int h = (index / width) % height + pad_h;
+            int c = index / (width * height);
+
+            // compute the start and end of the output
+            int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            int w_col_end = std::min(w / stride_w + 1, width_col);
+            int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            int h_col_end = std::min(h / stride_h + 1, height_col);
+
+            // equivalent implementation
+            int offset =
+                (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col;
+            int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
+            int coeff_w_col = (1 - stride_w * height_col * width_col);
+            for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+                for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+                    val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+                }
+            }
+            data_im[index] = val;
+        }
+    }
+};
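Why the `offset`/`coeff_h_col`/`coeff_w_col` constants work: for an output pixel at channel $c$ and padded coordinates $(h, w)$, the contribution gathered from column position $(h_{col}, w_{col})$ comes from kernel cell $(k_h, k_w) = (h - h_{col} s_h,\; w - w_{col} s_w)$. Its flat index in the column matrix expands as follows (with $H_c, W_c$ the column-grid height and width):

```latex
\begin{aligned}
\mathrm{idx}
 &= \bigl( c\,k_H k_W + (h - h_{col} s_h)\,k_W + (w - w_{col} s_w) \bigr) H_c W_c
    + h_{col} W_c + w_{col} \\
 &= \underbrace{(c\,k_H k_W + h\,k_W + w)\,H_c W_c}_{\texttt{offset}}
  + h_{col}\,\underbrace{(1 - s_h k_W H_c)\,W_c}_{\texttt{coeff\_h\_col}}
  + w_{col}\,\underbrace{(1 - s_w H_c W_c)}_{\texttt{coeff\_w\_col}}
\end{aligned}
```

so the inner double loop needs only two multiply-adds per element instead of recomputing the full three-dimensional index.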
+//single-threaded version
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col,
                 int channels, int height, int width,
-                int patch_h, int patch_w,
+                int kernel_h, int kernel_w,
                 int pad_h, int pad_w,
                 int stride_h, int stride_w,
                 Dtype* data_im)
 {
-    memset(data_im, 0, height * width * channels * sizeof(Dtype));
-
-    int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-    int channels_col = channels * patch_h * patch_w;
+    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int channels_col = channels * kernel_h * kernel_w;
+
+    std::memset(data_im, 0, height * width * channels * sizeof(Dtype));
 
     for (int c = 0; c < channels_col; ++c)
     {
-        int w_offset = c % patch_w;
-        int h_offset = (c / patch_w) % patch_h;
-        int c_im = c / patch_h / patch_w;
+        int w_offset = c % kernel_w;
+        int h_offset = (c / kernel_w) % kernel_h;
+        int c_im = c / kernel_h / kernel_w;
         for (int h = 0; h < height_col; ++h)
         {
@@ -148,12 +210,19 @@ void col2im_cpu(const Dtype* data_col,
 }
 
 #ifdef HAVE_OPENCL
-void im2col_ocl(const UMat &img,
+bool im2col_ocl(const UMat &img,
                 int channels, int height, int width,
                 int kernel_h, int kernel_w,
                 int pad_h, int pad_w,
                 int stride_h, int stride_w,
                 UMat &col);
+
+bool col2im_ocl(const UMat &col,
+                int channels, int height, int width,
+                int kernel_h, int kernel_w,
+                int pad_h, int pad_w,
+                int stride_h, int stride_w,
+                UMat &img);
 #endif
 }
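The two parallel bodies are adjoints, not inverses: im2col copies every input pixel into each patch that covers it, and col2im sums those copies back. A hedged usage sketch against the declarations above (assumes the classes live in cv::dnn and that the header can be included standalone):

```cpp
#include <vector>
#include "op_im2col.hpp" // assumed include path

// im2col followed by col2im multiplies each pixel by its coverage count:
// with a 3x3 kernel, stride 1, pad 0 on a 5x5 image, the center pixel is
// covered by all 9 patches while each corner is covered by exactly one.
void roundTripDemo()
{
    const int C = 1, H = 5, W = 5, K = 3, pad = 0, stride = 1;
    const int Hc = (H + 2*pad - K) / stride + 1;   // 3
    const int Wc = (W + 2*pad - K) / stride + 1;   // 3

    std::vector<float> img(C*H*W, 1.0f);
    std::vector<float> col(C*K*K*Hc*Wc, 0.0f);
    std::vector<float> back(C*H*W, 0.0f);

    cv::dnn::im2col_CpuPBody<float>::run(img.data(), C, H, W, K, K, pad, pad,
                                         stride, stride, col.data());
    cv::dnn::col2im_CpuPBody<float>::run(col.data(), C, H, W, K, K, pad, pad,
                                         stride, stride, back.data());
    // back[12] (the center) == 9.0f, back[0] (a corner) == 1.0f
}
```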
......
/*************************************************************************************
* Copyright (c) 2015, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
__kernel void col2im(const int n, __global const T* data_col, const int col_offset,
const int height, const int width, const int channels,
const int patch_h, const int patch_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
__global T* data_im, const int img_offset)
{
data_col = data_col + col_offset;
data_im = data_im + img_offset;
int index = get_global_id(0);
if(index < n) {
T val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
// equivalent implementation
int offset =
(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
data_im[index] = val;
}
}
@@ -39,11 +39,11 @@
 //
 //M*/
 
-__kernel void im2col(__global const float *im_src, int im_src_offset,
+__kernel void im2col(__global const T *im_src, int im_src_offset,
                      int channels, int height_inp, int width_inp,
                      int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,
                      int height_out, int width_out,
-                     __global float *im_col, int im_col_offset
+                     __global T *im_col, int im_col_offset
                      )
 {
     int index = get_global_id(0);
@@ -52,13 +52,13 @@ __kernel void im2col(__global const float *im_src, int im_src_offset,
     int j_out = index % width_out;
     int i_out = (index / width_out) % height_out;
     int c_inp = (index / width_out) / height_out;
 
     int c_out = c_inp * kernel_h * kernel_w;
     int i_inp = i_out * stride_h - pad_h;
     int j_inp = j_out * stride_w - pad_w;
 
-    im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset / sizeof(float);
-    im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset / sizeof(float);
+    im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;
+    im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;
 
     for (int ki = 0; ki < kernel_h; ++ki)
         for (int kj = 0; kj < kernel_w; ++kj) {
......
@@ -107,7 +107,7 @@ TEST(Layer_Test_LRN_channels, Accuracy)
 TEST(Layer_Test_Convolution, Accuracy)
 {
-    testLayerUsingCaffeModels("layer_convolution", true);
+    OCL_OFF(testLayerUsingCaffeModels("layer_convolution", true));
 }
 OCL_TEST(Layer_Test_Convolution, Accuracy)
 {
@@ -117,7 +117,7 @@ OCL_TEST(Layer_Test_Convolution, Accuracy)
 TEST(Layer_Test_DeConvolution, Accuracy)
 {
-    testLayerUsingCaffeModels("layer_deconvolution", true, false);
+    OCL_OFF(testLayerUsingCaffeModels("layer_deconvolution", true, false));
 }
 OCL_TEST(Layer_Test_DeConvolution, Accuracy)
 {
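For reference, OCL_OFF and OCL_TEST/OCL_ON come from OpenCV's test framework: the plain TEST now pins the accuracy check to the CPU path, while the OCL_TEST variant runs the same model with the transparent OpenCL API enabled. A hedged sketch of the convention (the model name is hypothetical):

```cpp
TEST(Layer_Test_Example, Accuracy)
{
    OCL_OFF(testLayerUsingCaffeModels("layer_example", true));
}
OCL_TEST(Layer_Test_Example, Accuracy)
{
    OCL_ON(testLayerUsingCaffeModels("layer_example", true));
}
```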
......