Commit dcdd6af5 authored by Alexander Alekhin

Merge pull request #10341 from pengli:dnn

parents badc3bd3 3b84acfc
......@@ -45,6 +45,7 @@
#include <float.h>
#include <algorithm>
#include <cmath>
#include "opencl_kernels_dnn.hpp"
namespace cv
{
......@@ -270,11 +271,108 @@ public:
return false;
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
int _layerWidth = inputs[0].size[3];
int _layerHeight = inputs[0].size[2];
int _imageWidth = inputs[1].size[3];
int _imageHeight = inputs[1].size[2];
float stepX, stepY;
if (_stepX == 0 || _stepY == 0)
{
stepX = static_cast<float>(_imageWidth) / _layerWidth;
stepY = static_cast<float>(_imageHeight) / _layerHeight;
} else {
stepX = _stepX;
stepY = _stepY;
}
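// Upload the prior-box parameters (offsets, aspect ratios, variances) to UMats once; they stay constant across forward passes.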
if (umat_offsetsX.empty())
{
Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
Mat offsetsY(1, _offsetsX.size(), CV_32FC1, &_offsetsY[0]);
Mat aspectRatios(1, _aspectRatios.size(), CV_32FC1, &_aspectRatios[0]);
Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
offsetsX.copyTo(umat_offsetsX);
offsetsY.copyTo(umat_offsetsY);
aspectRatios.copyTo(umat_aspectRatios);
variance.copyTo(umat_variance);
int real_numPriors = _numPriors / pow(2, _offsetsX.size() - 1);
umat_scales = UMat(1, &real_numPriors, CV_32F, 1.0f);
}
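// Launch one work-item per output grid cell; each cell emits all of its prior boxes.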
size_t nthreads = _layerHeight * _layerWidth;
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
kernel.set(0, (int)nthreads);
kernel.set(1, (float)stepX);
kernel.set(2, (float)stepY);
kernel.set(3, (float)_minSize);
kernel.set(4, (float)_maxSize);
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
kernel.set(6, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
kernel.set(7, (int)_offsetsX.size());
kernel.set(8, ocl::KernelArg::PtrReadOnly(umat_aspectRatios));
kernel.set(9, (int)_aspectRatios.size());
kernel.set(10, ocl::KernelArg::PtrReadOnly(umat_scales));
kernel.set(11, ocl::KernelArg::PtrWriteOnly(outputs[0]));
kernel.set(12, (int)_layerHeight);
kernel.set(13, (int)_layerWidth);
kernel.set(14, (int)_imageHeight);
kernel.set(15, (int)_imageWidth);
kernel.run(1, &nthreads, NULL, false);
// clip the prior's coordinates so that they stay within [0, 1]
if (_clip)
{
Mat mat = outputs[0].getMat(ACCESS_READ);
int aspect_count = (_maxSize > 0) ? 1 : 0;
int offset = nthreads * 4 * _offsetsX.size() * (1 + aspect_count + _aspectRatios.size());
float* outputPtr = mat.ptr<float>() + offset;
int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
for (size_t d = 0; d < _outChannelSize; ++d)
{
outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
}
}
// set the variance.
{
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
int offset = total(shape(outputs[0]), 2);
size_t nthreads = _layerHeight * _layerWidth * _numPriors;
kernel.set(0, (int)nthreads);
kernel.set(1, (int)offset);
kernel.set(2, (int)_variance.size());
kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
kernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
if (!kernel.run(1, &nthreads, NULL, false))
return false;
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
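// Generic CPU fallback, reached when the OpenCL path is not selected or forward_ocl() fails.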
Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}
......@@ -441,6 +539,14 @@ private:
std::vector<float> _offsetsX;
std::vector<float> _offsetsY;
#ifdef HAVE_OPENCL
UMat umat_offsetsX;
UMat umat_offsetsY;
UMat umat_aspectRatios;
UMat umat_scales;
UMat umat_variance;
#endif
bool _flip;
bool _clip;
bool _explicitSizes;
......
......@@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
bool createGEMMLikeConvKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
bool createDWConvKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
int32_t offset, int32_t size, bool write_only);
bool convolve(const UMat &bottom, UMat &top,
......@@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
int32_t M_;
bool tuned_;
bool dwconv_;
std::string key_, key_sanitized_;
std::string short_key_;
std::string kernel_name_;
......
......@@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
top_dim_ = num_output_ * output_w_ * output_h_;
cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
dwconv_ = (num_output_ == channels_ && channels_ == group_);
use_cache_path_ = false;
if (!cache_path_.empty())
......@@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
typedef enum {
KERNEL_TYPE_INTEL_IDLF = 2,
KERNEL_TYPE_BASIC = 4,
KERNEL_TYPE_GEMM_LIKE = 5
KERNEL_TYPE_GEMM_LIKE = 5,
KERNEL_TYPE_DWCONV = 6
} ocl4dnnConvSpatialKernelType_t;
template<typename Dtype>
......@@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
addDef("KERNEL_GEMM_LIKE");
addDef("INPUT_DEPTH", channels_);
addDef("WIDTH1", M_);
addDef("OUT_PADDING_LEFT", 0);
......@@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
}
else if (kernelType == KERNEL_TYPE_DWCONV)
{
kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
kernel_name_ = "DWCONV_";
kernel_name_ += kernelUKey.c_str();
options_ << " -cl-fast-relaxed-math ";
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
addDef("KERNEL_DWCONV");
addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
addDef("KERNEL_W", kernel_w_);
addDef("KERNEL_H", kernel_h_);
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", num_output_ * num_);
addDef("CHANNELS", num_output_);
setFusionDefine(fused_activ_, fused_eltwise_);
options_ << " -D DWCONV=" << kernel_name_;
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
}
template<typename Dtype>
......@@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
}
}
} else if (config->kernelType == KERNEL_TYPE_DWCONV) {
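// Depth-wise convolution path: a single 3D launch covers output width x output height x (output channels * batch).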
ocl::Kernel kernel(config->kernelName.c_str(), program);
if (kernel.empty())
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_);
size_t global_size[3];
global_size[0] = output_w_;
global_size[1] = output_h_;
global_size[2] = num_output_ * num_;
if (!kernel.run(3, global_size, NULL, false))
{
std::cout << "DWCONV kernel run failed." << std::endl;
return false;
}
} else {
for (int32_t n = 0; n < numImages; ++n) {
for (int32_t g = 0; g < group_; ++g) {
......@@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
return false;
}
template<>
bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth)
{
if (!dwconv_)
return false;
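// One output element per work-item; the local size is left at 1x1x1 (no block tuning for the depth-wise path).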
int workItemOutput[3] = { 1, 1, 1 };
size_t local_size[3] = { 1, 1, 1 };
size_t global_size[3];
global_size[0] = divUp(output_w_, workItemOutput[0]);
global_size[1] = divUp(output_h_, workItemOutput[1]);
global_size[2] = divUp(M_ * num_, workItemOutput[2]);
kernelType_ = KERNEL_TYPE_DWCONV;
blockM_ = blockWidth;
blockK_ = blockHeight;
blockN_ = blockDepth;
setupKernel();
ocl::Program program = compileKernel();
if (program.ptr())
{
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
&workItemOutput[0], false, KERNEL_TYPE_DWCONV));
return true;
}
else
return false;
}
template<>
bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
int32_t blockWidth,
......@@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
return createBasicKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_DWCONV)
return createDWConvKernel(blockWidth, blockHeight, blockDepth);
else
CV_Assert(0 && "Internal error");
return false;
......@@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
template<>
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
{
if (ocl::Device::getDefault().intelSubgroupsSupport()) {
if (ocl::Device::getDefault().intelSubgroupsSupport())
{
// depth-wise kernels
if (dwconv_)
{
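// Add the dedicated depth-wise kernel; with many groups (> 8) skip the other candidates entirely.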
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
if (group_ > 8)
return;
}
/* IDLF kernels use an Intel-specific extension, which makes
them Intel-only. */
// Generates static key_
......
......@@ -383,7 +383,7 @@ convolve_simd(
}
}
#else // KERNEL_GEMM_LIKE
#elif defined KERNEL_GEMM_LIKE
#if APPLY_BIAS
// Dtype bias[4];
......@@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
}
#endif
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
#elif defined KERNEL_DWCONV
__kernel void DWCONV(
ELTWISE_DATA_ARG
NEGATIVE_SLOPE_ARG
__global Dtype* image_data,
__global Dtype* kernel_data,
BIAS_KERNEL_ARG
__global Dtype* convolved_image,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height) {
const int outputX = get_global_id(0);
const int outputY = get_global_id(1);
const int outputZ = get_global_id(2);
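// outputZ enumerates (image, channel) pairs; the host launches num_output_ * num_ work-items along Z.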
if(outputX < output_width && outputY < output_height)
{
Dtype sum = 0.;
const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
const int biasIndex=outputZ%CHANNELS;
const int local_image_offset = org_y*input_width + org_x;
const int imageSize = input_width*input_height;
__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
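// Accumulate over the KERNEL_H x KERNEL_W window, skipping taps that fall outside the (padded) input.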
for(int y = 0; y < KERNEL_H; y++)
{
for(int x = 0; x < KERNEL_W; x++)
{
if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
{
continue;
}
sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
}
image_dataPtrFloat += input_width * DILATION_Y;
kernel_dataPtrFloat += KERNEL_W;
}
#if APPLY_BIAS
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
#else
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
#endif
}
}
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#define Dtype float
#define Dtype4 float4
__kernel void prior_box(const int nthreads,
const Dtype stepX,
const Dtype stepY,
const Dtype _minSize,
const Dtype _maxSize,
__global const Dtype* _offsetsX,
__global const Dtype* _offsetsY,
const int offsetsX_size,
__global const Dtype* _aspectRatios,
const int aspectRatios_size,
__global const Dtype* scales,
__global Dtype* dst,
const int _layerHeight,
const int _layerWidth,
const int imgHeight,
const int imgWidth)
{
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
int w = index % _layerWidth;
int h = index / _layerWidth;
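// Each work-item handles one grid cell (w, h) and writes all of that cell's prior boxes.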
__global Dtype* outputPtr;
int aspect_count = (_maxSize > 0) ? 1 : 0;
outputPtr = dst + index * 4 * offsetsX_size * (1 + aspect_count + aspectRatios_size);
Dtype _boxWidth, _boxHeight;
Dtype4 vec;
_boxWidth = _boxHeight = _minSize * scales[0];
for (int i = 0; i < offsetsX_size; ++i)
{
float center_x = (w + _offsetsX[i]) * stepX;
float center_y = (h + _offsetsY[i]) * stepY;
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
vec.z = (center_x + _boxWidth * 0.5f) / imgWidth; // xmax
vec.w = (center_y + _boxHeight * 0.5f) / imgHeight; // ymax
vstore4(vec, 0, outputPtr);
outputPtr += 4;
}
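// Optional second square prior with side sqrt(minSize * maxSize), emitted only when maxSize is set.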
if (_maxSize > 0)
{
_boxWidth = _boxHeight = native_sqrt(_minSize * _maxSize) * scales[1];
for (int i = 0; i < offsetsX_size; ++i)
{
float center_x = (w + _offsetsX[i]) * stepX;
float center_y = (h + _offsetsY[i]) * stepY;
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
vec.z = (center_x + _boxWidth * 0.5f) / imgWidth; // xmax
vec.w = (center_y + _boxHeight * 0.5f) / imgHeight; // ymax
vstore4(vec, 0, outputPtr);
outputPtr += 4;
}
}
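// Remaining priors: one box per aspect ratio, stretched around minSize by sqrt(ar).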
for (int r = 0; r < aspectRatios_size; ++r)
{
float ar = native_sqrt(_aspectRatios[r]);
float scale = scales[(_maxSize > 0 ? 2 : 1) + r];
_boxWidth = _minSize * ar * scale;
_boxHeight = _minSize / ar * scale;
for (int i = 0; i < offsetsX_size; ++i)
{
float center_x = (w + _offsetsX[i]) * stepX;
float center_y = (h + _offsetsY[i]) * stepY;
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
vec.z = (center_x + _boxWidth * 0.5f) / imgWidth; // xmax
vec.w = (center_y + _boxHeight * 0.5f) / imgHeight; // ymax
vstore4(vec, 0, outputPtr);
outputPtr += 4;
}
}
}
}
__kernel void set_variance(const int nthreads,
const int offset,
const int variance_size,
__global const Dtype* variance,
__global Dtype* dst)
{
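// Each work-item writes the 4 variance values for one prior; a single variance is broadcast to all four slots.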
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
{
Dtype4 var_vec;
if (variance_size == 1)
var_vec = (Dtype4)(variance[0]);
else
var_vec = vload4(0, variance);
vstore4(var_vec, 0, dst + offset + index * 4);
}
}
......@@ -11,6 +11,8 @@ Test for Tensorflow models loading
#include "test_precomp.hpp"
#include "npy_blob.hpp"
#include <opencv2/core/ocl.hpp>
#include <opencv2/ts/ocl_test.hpp>
namespace cvtest
{
......@@ -219,6 +221,43 @@ TEST(Test_TensorFlow, MobileNet_SSD)
normAssert(target[2].reshape(1, 1), output[2].reshape(1, 1), "", 4e-5, 1e-2);
}
OCL_TEST(Test_TensorFlow, MobileNet_SSD)
{
std::string netPath = findDataFile("dnn/ssd_mobilenet_v1_coco.pb", false);
std::string netConfig = findDataFile("dnn/ssd_mobilenet_v1_coco.pbtxt", false);
std::string imgPath = findDataFile("dnn/street.png", false);
Mat inp;
resize(imread(imgPath), inp, Size(300, 300));
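// blobFromImage scales pixels to [-1, 1]: (pixel - 127.5) / 127.5, and swaps BGR -> RGB.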
inp = blobFromImage(inp, 1.0f / 127.5, Size(), Scalar(127.5, 127.5, 127.5), true);
std::vector<String> outNames(3);
outNames[0] = "concat";
outNames[1] = "concat_1";
outNames[2] = "detection_out";
std::vector<Mat> target(outNames.size());
for (int i = 0; i < outNames.size(); ++i)
{
std::string path = findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco." + outNames[i] + ".npy", false);
target[i] = blobFromNPY(path);
}
Net net = readNetFromTensorflow(netPath, netConfig);
net.setPreferableBackend(DNN_BACKEND_DEFAULT);
net.setPreferableTarget(DNN_TARGET_OPENCL);
net.setInput(inp);
std::vector<Mat> output;
net.forward(output, outNames);
normAssert(target[0].reshape(1, 1), output[0].reshape(1, 1));
normAssert(target[1].reshape(1, 1), output[1].reshape(1, 1), "", 1e-5, 2e-4);
normAssert(target[2].reshape(1, 1), output[2].reshape(1, 1), "", 4e-5, 1e-2);
}
TEST(Test_TensorFlow, lstm)
{
runTensorFlowNet("lstm", true);
......