Commit d38aba91 authored by Fenglei, committed by Robert Kimball

add find algorithm for convolution without extra padding (#1710)

parent 2aa7899c
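The change follows one pattern in all three convolution builders: keep the hard-coded cuDNN algorithm as the default, but when the emitter has not already padded the input itself (i.e. cuDNN sees the real padding), ask cuDNN's heuristic for the fastest algorithm. Below is a minimal sketch of that pattern for the forward path, assuming the cuDNN v7 heuristic API used in this commit; the helper name, its parameter names, and the fall-back-on-error handling are illustrative, not taken from the nGraph sources.

#include <cudnn.h>

// Pick a forward convolution algorithm the way the patched emitter does:
// use cuDNN's heuristic only when the descriptors reflect the real padding.
static cudnnConvolutionFwdAlgo_t choose_fwd_algo(cudnnHandle_t handle,
                                                 cudnnTensorDescriptor_t x_desc,
                                                 cudnnFilterDescriptor_t w_desc,
                                                 cudnnConvolutionDescriptor_t conv_desc,
                                                 cudnnTensorDescriptor_t y_desc,
                                                 bool can_find_algo)
{
    // safe default, also used when the input was padded by a separate kernel
    cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
    if (can_find_algo)
    {
        cudnnStatus_t status =
            cudnnGetConvolutionForwardAlgorithm(handle,
                                                x_desc,
                                                w_desc,
                                                conv_desc,
                                                y_desc,
                                                CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                                                0, // memory limit, ignored for PREFER_FASTEST
                                                &algo);
        if (status != CUDNN_STATUS_SUCCESS)
        {
            algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; // keep the default on failure
        }
    }
    return algo;
}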
@@ -386,6 +386,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     size_t idx_workspace = std::numeric_limits<size_t>::max();
     size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
         input_shape_padded = runtime::gpu::get_padded_shape(
@@ -408,6 +409,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
         // asymmetric padding has been applied, zero out padding vectors to
         // ensure cudnn does not assume padding
         std::fill(padding_below.begin(), padding_below.end(), 0);
+        // the explicit padding above would make the algorithm search return a wrong result
+        can_find_algo = false;
     }
     size_t conv_index = build_convolution(dtype,
@@ -416,7 +419,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
                                           output_shape,
                                           window_movement_strides,
                                           window_dilation_strides,
-                                          padding_below);
+                                          padding_below,
+                                          can_find_algo);
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
@@ -508,6 +512,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     size_t idx_workspace = std::numeric_limits<size_t>::max();
     size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
     size_t slice_index = std::numeric_limits<size_t>::max();
+    bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
         output_shape_padded =
@@ -534,6 +539,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
         // asymmetric padding has been applied, zero out padding vectors to
         // ensure cudnn does not assume padding
         std::fill(padding_below.begin(), padding_below.end(), 0);
+        // the explicit padding above would make the algorithm search return a wrong result
+        can_find_algo = false;
     }
     size_t conv_index = build_convolution_backward_data(output_type,
@@ -542,7 +549,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
                                                         output_shape_padded,
                                                         window_movement_strides,
                                                         window_dilation_strides,
-                                                        padding_below);
+                                                        padding_below,
+                                                        can_find_algo);
     std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
                                                                          void** outputs) mutable {
@@ -625,6 +633,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     size_t idx_workspace = std::numeric_limits<size_t>::max();
     size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
         input_shape_padded = runtime::gpu::get_padded_shape(
@@ -645,6 +654,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
         // asymmetric padding has been applied, zero out padding vectors to
         // ensure cudnn does not assume padding
         std::fill(padding_below.begin(), padding_below.end(), 0);
+        // the explicit padding above would make the algorithm search return a wrong result
+        can_find_algo = false;
     }
     size_t conv_index = build_convolution_backward_filter(output_type,
@@ -653,7 +664,8 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
                                                           filter_shape,
                                                           window_movement_strides,
                                                           window_dilation_strides,
-                                                          padding_below);
+                                                          padding_below,
+                                                          can_find_algo);
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
@@ -915,7 +927,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
                                                      const Shape& output_tensor_shape,
                                                      const Strides& window_movement_strides,
                                                      const Strides& window_dilation_strides,
-                                                     const Shape& padding_below)
+                                                     const Shape& padding_below,
+                                                     const bool find_algo)
 {
     cudnnDataType_t data_type = get_cudnn_datatype(dtype);
     const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
@@ -928,7 +941,20 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
     auto& filter_desc = get_cudnn_filter_descriptor(input_filter_shape, data_type, tensor_format);
     auto& conv_desc = get_cudnn_convolution_descriptor(
         padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
-    const cudnnConvolutionFwdAlgo_t conv_fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+    cudnnConvolutionFwdAlgo_t conv_fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+    if (find_algo)
+    {
+        CUDNN_SAFE_CALL(cudnnGetConvolutionForwardAlgorithm(*m_ctx->cudnn_handle,
+                                                            tensor_desc_0,
+                                                            filter_desc,
+                                                            conv_desc,
+                                                            tensor_desc_1,
+                                                            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+                                                            /*memoryLimitInBytes=*/0,
+                                                            &conv_fwd_algo));
+    }
     void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
     void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
@@ -976,7 +1002,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
     const Shape& output_tensor_shape,
     const Strides& window_movement_strides,
     const Strides& window_dilation_strides,
-    const Shape& padding_below)
+    const Shape& padding_below,
+    const bool find_algo)
 {
     const cudnnDataType_t data_type = get_cudnn_datatype(dtype);
     const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
@@ -989,7 +1016,20 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
     auto& filter_desc = get_cudnn_filter_descriptor(input_filter_shape, data_type, tensor_format);
     auto& conv_desc = get_cudnn_convolution_descriptor(
         padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
-    const cudnnConvolutionBwdDataAlgo_t conv_bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+    cudnnConvolutionBwdDataAlgo_t conv_bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+    if (find_algo)
+    {
+        CUDNN_SAFE_CALL(
+            cudnnGetConvolutionBackwardDataAlgorithm(*m_ctx->cudnn_handle,
+                                                     filter_desc,
+                                                     tensor_desc_0,
+                                                     conv_desc,
+                                                     tensor_desc_1,
+                                                     CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+                                                     0,
+                                                     &conv_bwd_data_algo));
+    }
     void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
     void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
@@ -1037,7 +1077,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
     const Shape& output_filter_shape,
     const Strides& window_movement_strides,
     const Strides& window_dilation_strides,
-    const Shape& padding_below)
+    const Shape& padding_below,
+    const bool find_algo)
 {
     const cudnnDataType_t data_type = get_cudnn_datatype(dtype);
     const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
@@ -1050,9 +1091,19 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
     auto& filter_desc = get_cudnn_filter_descriptor(output_filter_shape, data_type, tensor_format);
     auto& conv_desc = get_cudnn_convolution_descriptor(
         padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
-    const cudnnConvolutionBwdFilterAlgo_t conv_bwd_filter_algo =
-        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+    cudnnConvolutionBwdFilterAlgo_t conv_bwd_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+    if (find_algo)
+    {
+        CUDNN_SAFE_CALL(
+            cudnnGetConvolutionBackwardFilterAlgorithm(*m_ctx->cudnn_handle,
+                                                       tensor_desc_0,
+                                                       tensor_desc_1,
+                                                       conv_desc,
+                                                       filter_desc,
+                                                       CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
+                                                       0,
+                                                       &conv_bwd_filter_algo));
+    }
     size_t workspace_size_in_bytes = 0;
     CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*m_ctx->cudnn_handle,
                                                                    tensor_desc_0,
...
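In the backward-filter path, whichever algorithm is selected also feeds the workspace-size query that follows in the unchanged code (cudnnGetConvolutionBackwardFilterWorkspaceSize), so the workspace is sized for the algorithm that will actually run. A sketch of that pairing, again assuming the cuDNN v7 API; the helper name and parameter names are illustrative and error checking is omitted for brevity.

#include <cudnn.h>

// Choose a backward-filter algorithm, then size the workspace for that choice.
static size_t bwd_filter_workspace_bytes(cudnnHandle_t handle,
                                         cudnnTensorDescriptor_t x_desc,
                                         cudnnTensorDescriptor_t dy_desc,
                                         cudnnConvolutionDescriptor_t conv_desc,
                                         cudnnFilterDescriptor_t dw_desc,
                                         bool can_find_algo)
{
    cudnnConvolutionBwdFilterAlgo_t algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    if (can_find_algo)
    {
        cudnnGetConvolutionBackwardFilterAlgorithm(handle,
                                                   x_desc,
                                                   dy_desc,
                                                   conv_desc,
                                                   dw_desc,
                                                   CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
                                                   0, // memory limit, ignored for PREFER_FASTEST
                                                   &algo);
    }
    // the workspace must match the algorithm that will actually be launched
    size_t bytes = 0;
    cudnnGetConvolutionBackwardFilterWorkspaceSize(
        handle, x_desc, dy_desc, conv_desc, dw_desc, algo, &bytes);
    return bytes;
}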
@@ -76,7 +76,8 @@ namespace ngraph
                                      const Shape& output_tensor_shape,
                                      const Strides& window_movement_strides,
                                      const Strides& window_dilation_strides,
-                                     const Shape& padding_below);
+                                     const Shape& padding_below,
+                                     const bool find_algo = false);
             size_t build_convolution_backward_data(const std::string& dtype,
                                                    const Shape& input_filter_shape,
@@ -84,7 +85,8 @@ namespace ngraph
                                                    const Shape& output_tensor_shape,
                                                    const Strides& window_movement_strides,
                                                    const Strides& window_dilation_strides,
-                                                   const Shape& padding_below);
+                                                   const Shape& padding_below,
+                                                   const bool find_algo = false);
             size_t build_convolution_backward_filter(const std::string& dtype,
                                                      const Shape& input_tensor_shape_0,
@@ -92,7 +94,8 @@ namespace ngraph
                                                      const Shape& output_filter_shape,
                                                      const Strides& window_movement_strides,
                                                      const Strides& window_dilation_strides,
-                                                     const Shape& padding_below);
+                                                     const Shape& padding_below,
+                                                     const bool find_algo = false);
             size_t build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
                                         const std::string& dtype,
...