Commit 7fef9aa9 authored by Fenglei, committed by Robert Kimball

gpu convolution asymmetric pad (#1064)

* change convolution to use cudnn emitter

* convolution working

* add asymmetric pad

* forward with asymmetric working

* backward asymmetric

* padding to padding_below

* pad still has bug on backward

* change name

* fix convolution back prop

* fix code block

* slice back from padded output

* working code

* extra ,

* Update gpu_emitter.cpp

* split build_convolution into 3 functions

* format and fix bugs

* Update cudnn_emitter.hpp
parent 6638e02b
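
Editorial note, not part of the commit: cuDNN convolution descriptors take a single pad value per spatial dimension, so asymmetric padding is handled here by padding the input explicitly with the CUDA build_pad kernel and then handing cuDNN zero padding (for ConvolutionBackpropData the destination is padded instead, and the result is sliced back out). The sketch below is a minimal host-side check of why that decomposition preserves the output extent; padded_shape and conv_out are illustrative helpers, not ngraph or cuDNN API.

#include <cstddef>
#include <iostream>
#include <vector>

// Illustrative only: trailing (spatial) dims carry the padding.
std::vector<size_t> padded_shape(const std::vector<size_t>& in,
                                 const std::vector<size_t>& pad_below,
                                 const std::vector<size_t>& pad_above)
{
    std::vector<size_t> out(in);
    size_t offset = in.size() - pad_below.size();
    for (size_t i = 0; i < pad_below.size(); i++)
    {
        out[offset + i] += pad_below[i] + pad_above[i];
    }
    return out;
}

// Output extent of one spatial dim for a strided, dilated convolution.
size_t conv_out(size_t in, size_t pad_below, size_t pad_above,
                size_t filter, size_t stride, size_t dilation)
{
    size_t effective_filter = dilation * (filter - 1) + 1;
    return (in + pad_below + pad_above - effective_filter) / stride + 1;
}

int main()
{
    std::vector<size_t> input{1, 1, 5, 7}; // NCHW
    std::vector<size_t> below{1, 2};       // asymmetric: below != above
    std::vector<size_t> above{2, 3};

    auto padded = padded_shape(input, below, above); // {1, 1, 8, 12}

    // Asymmetric convolution applied directly ...
    size_t h_direct = conv_out(input[2], below[0], above[0], 3, 1, 1);
    // ... matches a zero-padded convolution on the pre-padded input, which is
    // what cuDNN sees once the pad kernel has run and padding_below is zeroed.
    size_t h_decomposed = conv_out(padded[2], 0, 0, 3, 1, 1);

    std::cout << h_direct << " == " << h_decomposed << std::endl; // 6 == 6
    return 0;
}
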
......@@ -18,6 +18,7 @@
#include <sstream>
#include <vector>
#include "ngraph/log.hpp"
#include "ngraph/runtime/gpu/cudnn_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
......@@ -177,6 +178,305 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
return primitive_index;
}
cudnnFilterDescriptor_t& runtime::gpu::CUDNNEmitter::get_cudnn_filter_descriptor(
const Shape& shape, const cudnnDataType_t data_type, const cudnnTensorFormat_t tensor_format)
{
std::vector<int> dimensions(fmax(4, shape.size()), 1);
int idx = 0;
for (size_t i = dimensions.size() - shape.size(); i < dimensions.size(); i++)
{
dimensions[i] = static_cast<int>(shape[idx++]);
}
auto& filter_descriptor = m_descriptors.build<cudnnFilterDescriptor_t>();
if (dimensions.size() <= 4)
{
CUDNN_SAFE_CALL(cudnnSetFilter4dDescriptor(filter_descriptor,
/*dataType=*/data_type,
/*format=*/tensor_format,
/*dimension_size*/ dimensions[0],
/*dimension_size*/ dimensions[1],
/*dimension_size*/ dimensions[2],
/*dimension_size*/ dimensions[3]));
}
else
{
CUDNN_SAFE_CALL(
cudnnSetFilterNdDescriptor(filter_descriptor,
/*dataType=*/data_type,
/*format=*/tensor_format,
/*num_dimensions=*/static_cast<int>(dimensions.size()),
/*dimensions*/ dimensions.data()));
}
return filter_descriptor;
}
cudnnConvolutionDescriptor_t& runtime::gpu::CUDNNEmitter::get_cudnn_convolution_descriptor(
const Shape& padding,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
cudnnConvolutionMode_t mode,
cudnnDataType_t data_type)
{
auto& conv_descriptor = m_descriptors.build<cudnnConvolutionDescriptor_t>();
std::vector<int> window_movement_strides_int(window_movement_strides.size());
std::vector<int> window_dilation_strides_int(window_dilation_strides.size());
std::vector<int> padding_int(padding.size());
for (int i = 0; i < padding.size(); i++)
{
window_movement_strides_int[i] = static_cast<int>(window_movement_strides[i]);
window_dilation_strides_int[i] = static_cast<int>(window_dilation_strides[i]);
padding_int[i] = static_cast<int>(padding[i]);
}
if (padding.size() == 2)
{
CUDNN_SAFE_CALL(cudnnSetConvolution2dDescriptor(conv_descriptor,
padding_int[0],
padding_int[1],
window_movement_strides_int[0],
window_movement_strides_int[1],
window_dilation_strides_int[0],
window_dilation_strides_int[1],
mode,
data_type));
}
else
{
CUDNN_SAFE_CALL(cudnnSetConvolutionNdDescriptor(conv_descriptor,
static_cast<int>(padding_int.size()),
padding_int.data(),
window_movement_strides_int.data(),
window_dilation_strides_int.data(),
mode,
data_type));
}
return conv_descriptor;
}
size_t runtime::gpu::CUDNNEmitter::build_convolution(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
const Shape& output_tensor_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_op" << data_type << "_i" << join(input_tensor_shape, "_") << "_w"
<< join(input_filter_shape, "_") << "_o" << join(output_tensor_shape, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
auto& tensor_desc_0 = tensor_descriptor_from_shape(input_tensor_shape);
auto& tensor_desc_1 = tensor_descriptor_from_shape(output_tensor_shape);
auto& filter_desc = get_cudnn_filter_descriptor(input_filter_shape, data_type, tensor_format);
auto& conv_desc = get_cudnn_convolution_descriptor(
padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
const cudnnConvolutionFwdAlgo_t conv_fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
size_t workspace_size_in_bytes = 0;
CUDNN_SAFE_CALL(cudnnGetConvolutionForwardWorkspaceSize(*ctx->cudnn_handle,
tensor_desc_0,
filter_desc,
conv_desc,
tensor_desc_1,
conv_fwd_algo,
&workspace_size_in_bytes));
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
// (lazy) allocation for kernel arguments
size_t workspace_idx = allocator.reserve_workspace(workspace_size_in_bytes);
std::unique_ptr<gpu::primitive> conv;
conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
void** inputs, void** outputs) {
float alpha = 1.0;
float beta = 0.0;
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionForward(*ctx->cudnn_handle,
&alpha,
tensor_desc_0,
inputs[0],
filter_desc,
inputs[1],
conv_desc,
conv_fwd_algo,
workspace_ptr,
workspace_size_in_bytes,
&beta,
tensor_desc_1,
outputs[0]));
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const Shape& input_filter_shape,
const Shape& input_tensor_shape,
const Shape& output_tensor_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_data_op" << data_type << "_i" << join(input_tensor_shape, "_") << "_w"
<< join(input_filter_shape, "_") << "_o" << join(output_tensor_shape, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
auto& tensor_desc_0 = tensor_descriptor_from_shape(input_tensor_shape);
auto& tensor_desc_1 = tensor_descriptor_from_shape(output_tensor_shape);
auto& filter_desc = get_cudnn_filter_descriptor(input_filter_shape, data_type, tensor_format);
auto& conv_desc = get_cudnn_convolution_descriptor(
padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
const cudnnConvolutionBwdDataAlgo_t conv_bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
size_t workspace_size_in_bytes = 0;
CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(*ctx->cudnn_handle,
filter_desc,
tensor_desc_0,
conv_desc,
tensor_desc_1,
conv_bwd_data_algo,
&workspace_size_in_bytes));
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
// (lazy) allocation for kernel arguments
size_t workspace_idx = allocator.reserve_workspace(workspace_size_in_bytes);
std::unique_ptr<gpu::primitive> conv;
conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
void** inputs, void** outputs) {
float alpha = 1.0;
float beta = 0.0;
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionBackwardData(*ctx->cudnn_handle,
&alpha,
filter_desc,
inputs[0],
tensor_desc_0,
inputs[1],
conv_desc,
conv_bwd_data_algo,
workspace_ptr,
workspace_size_in_bytes,
&beta,
tensor_desc_1,
outputs[0]));
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1,
const Shape& output_filter_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_filter_op" << data_type << "_i" << join(input_tensor_shape_0, "_") << "_w"
<< join(output_filter_shape, "_") << "_o" << join(input_tensor_shape_1, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
auto& tensor_desc_0 = tensor_descriptor_from_shape(input_tensor_shape_0);
auto& tensor_desc_1 = tensor_descriptor_from_shape(input_tensor_shape_1);
auto& filter_desc = get_cudnn_filter_descriptor(output_filter_shape, data_type, tensor_format);
auto& conv_desc = get_cudnn_convolution_descriptor(
padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
const cudnnConvolutionBwdFilterAlgo_t conv_bwd_filter_algo =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
size_t workspace_size_in_bytes = 0;
CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*ctx->cudnn_handle,
tensor_desc_0,
tensor_desc_1,
conv_desc,
filter_desc,
conv_bwd_filter_algo,
&workspace_size_in_bytes));
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
// (lazy) allocation for kernel arguments
size_t workspace_idx = allocator.reserve_workspace(workspace_size_in_bytes);
std::unique_ptr<gpu::primitive> conv;
conv.reset(new gpu::primitive{[=, &conv_desc, &tensor_desc_0, &filter_desc, &tensor_desc_1](
void** inputs, void** outputs) {
float alpha = 1.0;
float beta = 0.0;
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(ctx, workspace_idx);
CUDNN_SAFE_CALL(cudnnConvolutionBackwardFilter(*ctx->cudnn_handle,
&alpha,
tensor_desc_0,
inputs[0],
tensor_desc_1,
inputs[1],
conv_desc,
conv_bwd_filter_algo,
workspace_ptr,
workspace_size_in_bytes,
&beta,
filter_desc,
outputs[0]));
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnPoolingMode_t& pool_op,
const Prop& direction,
......
......@@ -50,11 +50,38 @@ namespace ngraph
public:
enum class Prop
{
Inference,
Inference = 0,
Forward,
Backward
};
size_t build_convolution(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
const Shape& output_tensor_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_convolution_backward_data(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const Shape& input_filter_shape,
const Shape& input_tensor_shape,
const Shape& output_tensor_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_convolution_backward_filter(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1,
const Shape& output_filter_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below);
size_t build_reduce_forward(const GPURuntimeContext* ctx,
const cudnnReduceTensorOp_t& reduce_op,
const Shape& input_shape,
......@@ -84,6 +111,16 @@ namespace ngraph
const Shape& tensor_shape);
cudnnTensorDescriptor_t& tensor_descriptor_from_shape(const Shape& shape);
cudnnFilterDescriptor_t&
get_cudnn_filter_descriptor(const Shape& shape,
const cudnnDataType_t data_type,
const cudnnTensorFormat_t tensor_format);
cudnnConvolutionDescriptor_t&
get_cudnn_convolution_descriptor(const Shape& padding,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
cudnnConvolutionMode_t mode,
cudnnDataType_t data_type);
private:
CUDNNEmitter(GPUPrimitiveEmitter* emitter);
......
......@@ -156,22 +156,21 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
return;
}
const std::string args0 = "x_descriptor";
const std::string args1 = "w_descriptor";
const std::string out0 = "y_descriptor";
const std::string conv_descriptor = "conv_descriptor";
const std::string data_type = "CUDNN_DATA_FLOAT";
const std::string tensor_format = "CUDNN_TENSOR_NCHW";
const std::string mode = "CUDNN_CROSS_CORRELATION";
const std::string conv_algo = "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM";
auto convolution = static_cast<const ngraph::op::Convolution*>(node);
Strides window_dilation_strides = convolution->get_window_dilation_strides();
Strides window_movement_strides = convolution->get_window_movement_strides();
Strides data_dilation_strides = convolution->get_data_dilation_strides();
CoordinateDiff padding = convolution->get_padding_below();
CoordinateDiff padding_above = convolution->get_padding_above();
CoordinateDiff padding_below_diff = convolution->get_padding_below();
CoordinateDiff padding_above_diff = convolution->get_padding_above();
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below_diff.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
if (padding.size() > 3)
if (padding_below.size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
......@@ -184,49 +183,76 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
"with data dilation is not implemented.");
}
}
for (int i = 0; i < padding.size(); i++)
bool pad_required = false;
if (padding_below != padding_above)
{
if (padding[i] != padding_above[i])
{
throw std::runtime_error(node->get_name() +
"with asymmetric padding is not implemented.");
}
pad_required = true;
}
auto input_shape = args[0].get_shape();
auto input_shape_padded = input_shape;
if (pad_required)
{
input_shape_padded =
get_padded_shape(input_shape, padding_below, padding_above, {});
auto temp_size =
shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_index =
cuda_emitter->build_pad(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
padding_below,
padding_above,
Shape{},
std::string("0"));
writer << "gpu::invoke_primitive(ctx, " << pad_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has already been applied; zero out the padding vectors
// so cudnn does not apply any additional padding
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
size_t index = cudnn_emitter->build_convolution(external_function->ctx().get(),
data_type,
input_shape_padded,
args[1].get_shape(),
out[0].get_shape(),
window_movement_strides,
window_dilation_strides,
padding_below);
writer.block_begin(" // " + node->get_name());
writer << "float alpha = 1.0;\n";
writer << "float beta = 0.0;\n";
// construct input and output tensor descriptor
kernel::emit_cudnnTensorDescriptor(
writer, args0, tensor_format, data_type, args[0].get_shape());
kernel::emit_cudnnFilterDescriptor(
writer, args1, tensor_format, data_type, args[1].get_shape());
kernel::emit_cudnnTensorDescriptor(
writer, out0, tensor_format, data_type, out[0].get_shape());
kernel::emit_cudnnConvolutionDescriptor(writer,
conv_descriptor,
padding,
window_movement_strides,
window_dilation_strides,
mode,
data_type);
writer << "size_t workSpaceSizeInBytes = 0;\n";
writer << "CUDNN_SAFE_CALL(cudnnGetConvolutionForwardWorkspaceSize(*ctx->cudnn_"
"handle, "
<< args0 << ", " << args1 << ", " << conv_descriptor << ", " << out0 << ", "
<< conv_algo << ", "
<< "&workSpaceSizeInBytes));\n";
writer << "void* workspace = "
"runtime::gpu::create_gpu_buffer(workSpaceSizeInBytes);\n";
writer << "CUDNN_SAFE_CALL(cudnnConvolutionForward(*ctx->cudnn_handle, "
<< "&alpha, " << args0 << ", " << args[0].get_name() << ", " << args1 << ", "
<< args[1].get_name() << ", " << conv_descriptor << ", " << conv_algo << ", "
<< "workspace, workSpaceSizeInBytes, "
<< "&beta, " << out0 << ", " << out[0].get_name() << "));\n";
writer << "runtime::gpu::free_gpu_buffer(workspace);\n";
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
if (pad_required)
{
writer << "std::vector<void*>{pad_buffer, " << args[1].get_name()
<< "}.data(), ";
}
else
{
writer << "std::vector<void*>{" << args[0].get_name() << ","
<< args[1].get_name() << "}.data(), ";
}
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
writer.block_end();
}
......@@ -238,25 +264,22 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
return;
}
const std::string args0 = "w_descriptor";
const std::string args1 = "dy_descriptor";
const std::string out0 = "dx_descriptor";
const std::string conv_descriptor = "conv_descriptor";
const std::string data_type = "CUDNN_DATA_FLOAT";
const std::string tensor_format = "CUDNN_TENSOR_NCHW";
const std::string mode = "CUDNN_CROSS_CORRELATION";
const std::string conv_algo = "CUDNN_CONVOLUTION_BWD_DATA_ALGO_0";
auto convolution = static_cast<const ngraph::op::ConvolutionBackpropData*>(node);
Strides window_dilation_strides =
convolution->get_window_dilation_strides_forward();
Strides window_movement_strides =
convolution->get_window_movement_strides_forward();
Strides data_dilation_strides = convolution->get_data_dilation_strides_forward();
CoordinateDiff padding = convolution->get_padding_below_forward();
CoordinateDiff padding_above = convolution->get_padding_above_forward();
if (padding.size() > 3)
CoordinateDiff padding_below_diff = convolution->get_padding_below_forward();
CoordinateDiff padding_above_diff = convolution->get_padding_above_forward();
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below_diff.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
if (padding_below.size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
......@@ -269,51 +292,126 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
"with data dilation is not implemented.");
}
}
for (int i = 0; i < padding.size(); i++)
bool pad_required = false;
if (padding_below != padding_above)
{
if (padding[i] != padding_above[i])
{
throw std::runtime_error(node->get_name() +
"with asymmetric padding is not implemented.");
}
pad_required = true;
}
auto output_shape = out[0].get_shape();
auto output_shape_padded = output_shape;
Shape padding_below_back(output_shape.size(), 0);
size_t i = padding_below_back.size() - padding_below.size();
size_t j = 0;
for (; i < padding_below_back.size(); i++)
{
padding_below_back[i] = padding_below[j++];
}
writer.block_begin(" // " + node->get_name());
writer << "float alpha = 1.0;\n";
writer << "float beta = 0.0;\n";
// construct input and output tensor descriptor
kernel::emit_cudnnFilterDescriptor(
writer, args0, tensor_format, data_type, args[0].get_shape());
kernel::emit_cudnnTensorDescriptor(
writer, args1, tensor_format, data_type, args[1].get_shape());
kernel::emit_cudnnTensorDescriptor(
writer, out0, tensor_format, data_type, out[0].get_shape());
kernel::emit_cudnnConvolutionDescriptor(writer,
conv_descriptor,
padding,
window_movement_strides,
window_dilation_strides,
mode,
data_type);
writer << "size_t workSpaceSizeInBytes = 0;\n";
writer << "CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(*ctx->"
"cudnn_handle, "
<< args0 << ", " << args1 << ", " << conv_descriptor << ", " << out0 << ", "
<< conv_algo << ", "
<< "&workSpaceSizeInBytes));\n";
writer << "void* workspace = "
"runtime::gpu::create_gpu_buffer(workSpaceSizeInBytes);\n";
writer << "CUDNN_SAFE_CALL(cudnnConvolutionBackwardData(*ctx->cudnn_handle, "
<< "&alpha, " << args0 << ", " << args[0].get_name() << ", " << args1 << ", "
<< args[1].get_name() << ", " << conv_descriptor << ", " << conv_algo << ", "
<< "workspace, workSpaceSizeInBytes, "
<< "&beta, " << out0 << ", " << out[0].get_name() << "));\n";
writer << "runtime::gpu::free_gpu_buffer(workspace);\n";
if (pad_required)
{
output_shape_padded =
get_padded_shape(output_shape, padding_below, padding_above, {});
auto temp_size =
shape_size(output_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_index =
cuda_emitter->build_pad(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
output_shape,
output_shape_padded,
padding_below,
padding_above,
Shape{},
std::string("0"));
writer << "gpu::invoke_primitive(ctx, " << pad_index << ", ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has already been applied; zero out the padding vectors
// so cudnn does not apply any additional padding
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
size_t index =
cudnn_emitter->build_convolution_backward_data(external_function->ctx().get(),
data_type,
args[0].get_shape(),
args[1].get_shape(),
output_shape_padded,
window_movement_strides,
window_dilation_strides,
padding_below);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "," << args[1].get_name()
<< "}.data(), ";
if (pad_required)
{
writer << "std::vector<void*>{pad_buffer}.data()";
}
else
{
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
}
writer << ");\n";
// since the output was padded into a temp buffer, copy the result back to the real output
if (pad_required)
{
const auto arg_rank = output_shape.size();
const Strides slice_strides(output_shape.size(), 1);
const auto input_strides = row_major_strides(output_shape_padded);
const auto output_strides = row_major_strides(output_shape);
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_input_strides = allocator.reserve_argspace(
input_strides.data(), input_strides.size() * sizeof(size_t));
size_t idx_output_strides = allocator.reserve_argspace(
output_strides.data(), output_strides.size() * sizeof(size_t));
size_t idx_lower_bounds = allocator.reserve_argspace(
padding_below_back.data(), padding_below_back.size() * sizeof(size_t));
size_t idx_slice_strides = allocator.reserve_argspace(
slice_strides.data(), slice_strides.size() * sizeof(size_t));
writer << "size_t rank = " << arg_rank << ";\n";
writer << "void* input_strides_d = "
<< " runtime::gpu::invoke_memory_primitive(ctx, " << idx_input_strides
<< ");\n";
writer << "void* output_strides_d = "
<< " runtime::gpu::invoke_memory_primitive(ctx, " << idx_output_strides
<< ");\n";
writer << "void* slice_strides_d = "
<< " runtime::gpu::invoke_memory_primitive(ctx, " << idx_slice_strides
<< ");\n";
writer << "void* lower_bounds_d = "
<< " runtime::gpu::invoke_memory_primitive(ctx, " << idx_lower_bounds
<< ");\n";
writer << "runtime::gpu::emit_slice(\"" << node->description()
<< "\", CUdeviceptr(pad_buffer), CUdeviceptr(" << out[0].get_name()
<< ")"
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type()
<< "\"}"
<< ", "
<< "ctx, "
<< "CUdeviceptr(input_strides_d), CUdeviceptr(lower_bounds_d), "
"CUdeviceptr(slice_strides_d), CUdeviceptr(output_strides_d)"
<< ", " << arg_rank << ", " << out[0].get_size() << ");\n";
}
writer.block_end();
}
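
Editorial note, not part of the commit: in the data-backprop case just above, cuDNN writes its result into the oversized pad_buffer, and the real output is recovered by a strided slice (the runtime::gpu::emit_slice call) whose lower bounds are padding_below_back and whose slice strides are all 1. Below is a host-side, row-major sketch of that slice-back step under the same layout assumptions; slice_back is an illustrative stand-in for the GPU kernel, not ngraph API.

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Row-major strides for a shape, e.g. {1, 1, 4, 4} -> {16, 16, 4, 1}.
std::vector<size_t> row_major_strides(const std::vector<size_t>& shape)
{
    std::vector<size_t> strides(shape.size(), 1);
    for (size_t i = shape.size(); i-- > 1;)
    {
        strides[i - 1] = strides[i] * shape[i];
    }
    return strides;
}

// Copy the unpadded region out of a padded buffer: unit slice strides,
// lower bounds equal to the per-axis padding below (padded to full rank).
void slice_back(const std::vector<float>& padded,
                const std::vector<size_t>& padded_dims,
                std::vector<float>& out,
                const std::vector<size_t>& out_dims,
                const std::vector<size_t>& lower_bounds)
{
    auto in_strides = row_major_strides(padded_dims);
    auto out_strides = row_major_strides(out_dims);
    size_t count = std::accumulate(
        out_dims.begin(), out_dims.end(), size_t{1}, std::multiplies<size_t>());
    for (size_t flat = 0; flat < count; flat++)
    {
        // Decompose the flat output index into coordinates, then shift each
        // coordinate by its lower bound to find the source element.
        size_t rem = flat;
        size_t src = 0;
        for (size_t d = 0; d < out_dims.size(); d++)
        {
            size_t coord = rem / out_strides[d];
            rem %= out_strides[d];
            src += (coord + lower_bounds[d]) * in_strides[d];
        }
        out[flat] = padded[src];
    }
}

int main()
{
    // A 1x1x4x4 padded buffer holding a 1x1x2x3 result at offset {0, 0, 1, 1}.
    std::vector<size_t> padded_dims{1, 1, 4, 4};
    std::vector<size_t> out_dims{1, 1, 2, 3};
    std::vector<size_t> lower{0, 0, 1, 1};
    std::vector<float> padded(16);
    for (size_t i = 0; i < padded.size(); i++)
    {
        padded[i] = static_cast<float>(i);
    }
    std::vector<float> out(6);
    slice_back(padded, padded_dims, out, out_dims, lower);
    for (float v : out)
    {
        std::cout << v << " "; // 5 6 7 9 10 11
    }
    std::cout << std::endl;
    return 0;
}
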
......@@ -325,25 +423,25 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
return;
}
const std::string args0 = "x_descriptor";
const std::string args1 = "dy_descriptor";
const std::string out0 = "dw_descriptor";
const std::string conv_descriptor = "conv_descriptor";
const std::string data_type = "CUDNN_DATA_FLOAT";
const std::string tensor_format = "CUDNN_TENSOR_NCHW";
const std::string mode = "CUDNN_CROSS_CORRELATION";
const std::string conv_algo = "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0";
auto convolution = static_cast<const ngraph::op::ConvolutionBackpropFilters*>(node);
Strides window_dilation_strides =
convolution->get_window_dilation_strides_forward();
Strides window_movement_strides =
convolution->get_window_movement_strides_forward();
Strides data_dilation_strides = convolution->get_data_dilation_strides_forward();
CoordinateDiff padding = convolution->get_padding_below_forward();
CoordinateDiff padding_above = convolution->get_padding_above_forward();
CoordinateDiff padding_below_diff = convolution->get_padding_below_forward();
CoordinateDiff padding_above_diff = convolution->get_padding_above_forward();
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
if (padding.size() > 3)
for (int i = 0; i < padding_below_diff.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
if (padding_below.size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
......@@ -356,51 +454,75 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
"with data dilation is not implemented.");
}
}
for (int i = 0; i < padding.size(); i++)
bool pad_required = false;
if (padding_below != padding_above)
{
if (padding[i] != padding_above[i])
{
throw std::runtime_error(node->get_name() +
"with asymmetric padding is not implemented.");
}
pad_required = true;
}
auto input_shape = args[0].get_shape();
auto input_shape_padded = input_shape;
writer.block_begin(" //data_dilation_ " + node->get_name());
writer << "int count = " << out[0].get_size() << ";\n";
writer << "float alpha = 1.0;\n";
writer << "float beta = 0.0;\n";
// construct input and output tensor descriptor
kernel::emit_cudnnTensorDescriptor(
writer, args0, tensor_format, data_type, args[0].get_shape());
kernel::emit_cudnnTensorDescriptor(
writer, args1, tensor_format, data_type, args[1].get_shape());
kernel::emit_cudnnFilterDescriptor(
writer, out0, tensor_format, data_type, out[0].get_shape());
kernel::emit_cudnnConvolutionDescriptor(writer,
conv_descriptor,
padding,
window_movement_strides,
window_dilation_strides,
mode,
data_type);
writer << "size_t workSpaceSizeInBytes = 0;\n";
writer << "CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*ctx->"
"cudnn_handle, "
<< args0 << ", " << args1 << ", " << conv_descriptor << ", " << out0 << ", "
<< conv_algo << ", "
<< "&workSpaceSizeInBytes));\n";
writer << "void* workspace = "
"runtime::gpu::create_gpu_buffer(workSpaceSizeInBytes);\n";
writer << "CUDNN_SAFE_CALL(cudnnConvolutionBackwardFilter(*ctx->cudnn_handle, "
<< "&alpha, " << args0 << ", " << args[0].get_name() << ", " << args1 << ", "
<< args[1].get_name() << ", " << conv_descriptor << ", " << conv_algo << ", "
<< "workspace, workSpaceSizeInBytes, "
<< "&beta, " << out0 << ", " << out[0].get_name() << "));\n";
writer << "runtime::gpu::free_gpu_buffer(workspace);\n";
writer.block_begin(" // " + node->get_name());
if (pad_required)
{
input_shape_padded =
get_padded_shape(input_shape, padding_below, padding_above, {});
auto temp_size =
shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_index =
cuda_emitter->build_pad(external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
padding_below,
padding_above,
Shape{},
std::string("0"));
writer << "gpu::invoke_primitive(ctx, " << pad_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has already been applied; zero out the padding vectors
// so cudnn does not apply any additional padding
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
size_t index =
cudnn_emitter->build_convolution_backward_filter(external_function->ctx().get(),
data_type,
input_shape_padded,
args[1].get_shape(),
out[0].get_shape(),
window_movement_strides,
window_dilation_strides,
padding_below);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
if (pad_required)
{
writer << "std::vector<void*>{pad_buffer, " << args[1].get_name()
<< "}.data(), ";
}
else
{
writer << "std::vector<void*>{" << args[0].get_name() << ","
<< args[1].get_name() << "}.data(), ";
}
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
writer.block_end();
}
......
......@@ -15,7 +15,6 @@ convolution_2d_1item_1o1i_data_dilated
convolution_2d_1item_2o1i_data_dilated
convolution_2d_1item_2o2i_data_dilated
convolution_2d_1item_5o3i_data_dilated
convolution_2d_1item_padded_2_3x4_5
convolution_2d_2item_5o3i_data_dilated
convolution_2d_2items_dilated_padded
convolution_2d_2items_strided_padded
......