Commit 20a01781 authored by Chris Sullivan's avatar Chris Sullivan Committed by Robert Kimball

cudnnFind/Get interoperability (#1721)

* add find algorithm for convolution without extra padding

* Use cudnnFind* or cudnnGet* depending on tuning param boolean. Add select function to search the perf results of the cudnn queries.

* Formatting.

* Algo search no longer binary, now it is either off, a heuristic search (cudnnGet*) or an explicit search (cudnnFind*).

* Formatting.

* switch to explicit.

* Throw if no suitable cudnn algo found.

* Formatting

* Remove comment.
parent f2f42fa9
...@@ -419,7 +419,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -419,7 +419,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; auto algo_policy = algo_search::EXPLICIT;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
{ {
input_shape_padded = runtime::gpu::get_padded_shape( input_shape_padded = runtime::gpu::get_padded_shape(
...@@ -442,7 +442,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -442,7 +442,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
// ensure cudnn does not assume padding // ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0); std::fill(padding_below.begin(), padding_below.end(), 0);
// padding will make find_algorithm for convolution get wrong result // padding will make find_algorithm for convolution get wrong result
can_find_algo = false; algo_policy = algo_search::NONE;
} }
size_t conv_index = build_convolution(dtype, size_t conv_index = build_convolution(dtype,
...@@ -452,7 +452,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -452,7 +452,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
window_movement_strides, window_movement_strides,
window_dilation_strides, window_dilation_strides,
padding_below, padding_below,
can_find_algo); algo_policy);
std::unique_ptr<gpu::primitive> kernel_launch( std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable { new gpu::primitive{[=](void** inputs, void** outputs) mutable {
...@@ -543,7 +543,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -543,7 +543,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
size_t slice_index = std::numeric_limits<size_t>::max(); size_t slice_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; auto algo_policy = algo_search::EXPLICIT;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
{ {
output_shape_padded = output_shape_padded =
...@@ -571,7 +571,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -571,7 +571,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
// ensure cudnn does not assume padding // ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0); std::fill(padding_below.begin(), padding_below.end(), 0);
// padding will make find_algorithm for convolution get wrong result // padding will make find_algorithm for convolution get wrong result
can_find_algo = false; algo_policy = algo_search::NONE;
} }
size_t conv_index = build_convolution_backward_data(output_type, size_t conv_index = build_convolution_backward_data(output_type,
...@@ -581,7 +581,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -581,7 +581,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
window_movement_strides, window_movement_strides,
window_dilation_strides, window_dilation_strides,
padding_below, padding_below,
can_find_algo); algo_policy);
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs, std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable { void** outputs) mutable {
...@@ -662,7 +662,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -662,7 +662,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; auto algo_policy = algo_search::EXPLICIT;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
{ {
input_shape_padded = runtime::gpu::get_padded_shape( input_shape_padded = runtime::gpu::get_padded_shape(
...@@ -684,7 +684,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -684,7 +684,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
// ensure cudnn does not assume padding // ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0); std::fill(padding_below.begin(), padding_below.end(), 0);
// padding will make find_algorithm for convolution get wrong result // padding will make find_algorithm for convolution get wrong result
can_find_algo = false; algo_policy = algo_search::NONE;
} }
size_t conv_index = build_convolution_backward_filter(output_type, size_t conv_index = build_convolution_backward_filter(output_type,
...@@ -694,7 +694,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -694,7 +694,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
window_movement_strides, window_movement_strides,
window_dilation_strides, window_dilation_strides,
padding_below, padding_below,
can_find_algo); algo_policy);
std::unique_ptr<gpu::primitive> kernel_launch( std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable { new gpu::primitive{[=](void** inputs, void** outputs) mutable {
...@@ -1155,9 +1155,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::gpu::Rnn* node) ...@@ -1155,9 +1155,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::gpu::Rnn* node)
debug_sync(); debug_sync();
}}); }});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch)); return this->m_primitive_emitter->register_primitive(kernel_launch, hash);
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
} }
#endif #endif
...@@ -1168,7 +1166,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype, ...@@ -1168,7 +1166,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
const Strides& window_movement_strides, const Strides& window_movement_strides,
const Strides& window_dilation_strides, const Strides& window_dilation_strides,
const Shape& padding_below, const Shape& padding_below,
const bool find_algo) const algo_search find_algo)
{ {
cudnnDataType_t data_type = get_cudnn_datatype(dtype); cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW; const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
...@@ -1181,20 +1179,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype, ...@@ -1181,20 +1179,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
auto& filter_desc = get_cudnn_filter_descriptor(input_filter_shape, data_type, tensor_format); auto& filter_desc = get_cudnn_filter_descriptor(input_filter_shape, data_type, tensor_format);
auto& conv_desc = get_cudnn_convolution_descriptor( auto& conv_desc = get_cudnn_convolution_descriptor(
padding_below, window_movement_strides, window_dilation_strides, mode, data_type); padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
const cudnnConvolutionFwdAlgo_t conv_fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
cudnnConvolutionFwdAlgo_t conv_fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
if (find_algo)
{
CUDNN_SAFE_CALL(cudnnGetConvolutionForwardAlgorithm(*m_ctx->cudnn_handle,
tensor_desc_0,
filter_desc,
conv_desc,
tensor_desc_1,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
/*memoryLimitInBytes=*/0,
&conv_fwd_algo));
}
void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0); void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0); void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
...@@ -1243,7 +1228,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data( ...@@ -1243,7 +1228,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
const Strides& window_movement_strides, const Strides& window_movement_strides,
const Strides& window_dilation_strides, const Strides& window_dilation_strides,
const Shape& padding_below, const Shape& padding_below,
const bool find_algo) const algo_search find_algo)
{ {
const cudnnDataType_t data_type = get_cudnn_datatype(dtype); const cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW; const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
...@@ -1256,18 +1241,30 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data( ...@@ -1256,18 +1241,30 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
auto& filter_desc = get_cudnn_filter_descriptor(input_filter_shape, data_type, tensor_format); auto& filter_desc = get_cudnn_filter_descriptor(input_filter_shape, data_type, tensor_format);
auto& conv_desc = get_cudnn_convolution_descriptor( auto& conv_desc = get_cudnn_convolution_descriptor(
padding_below, window_movement_strides, window_dilation_strides, mode, data_type); padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
cudnnConvolutionBwdDataAlgo_t conv_bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; cudnnConvolutionBwdDataAlgo_t conv_bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
if (find_algo) if (find_algo != algo_search::NONE)
{ {
int num_algos;
int max_algos = 0;
CUDNN_SAFE_CALL( CUDNN_SAFE_CALL(
cudnnGetConvolutionBackwardDataAlgorithm(*m_ctx->cudnn_handle, cudnnGetConvolutionBackwardDataAlgorithmMaxCount(*m_ctx->cudnn_handle, &max_algos));
filter_desc, std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(max_algos);
tensor_desc_0, auto cudnn_algo_search = (find_algo == algo_search::EXPLICIT)
conv_desc, ? cudnnFindConvolutionBackwardDataAlgorithm
tensor_desc_1, : cudnnGetConvolutionBackwardDataAlgorithm_v7;
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, CUDNN_SAFE_CALL((*cudnn_algo_search)(*m_ctx->cudnn_handle,
0, filter_desc,
&conv_bwd_data_algo)); tensor_desc_0,
conv_desc,
tensor_desc_1,
static_cast<int>(results.size()),
&num_algos,
results.data()));
results.resize(num_algos);
conv_bwd_data_algo =
select_cudnn_algo<cudnnConvolutionBwdDataAlgoPerf_t, cudnnConvolutionBwdDataAlgo_t>(
results);
} }
void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0); void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
...@@ -1318,7 +1315,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter( ...@@ -1318,7 +1315,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
const Strides& window_movement_strides, const Strides& window_movement_strides,
const Strides& window_dilation_strides, const Strides& window_dilation_strides,
const Shape& padding_below, const Shape& padding_below,
const bool find_algo) const algo_search find_algo)
{ {
const cudnnDataType_t data_type = get_cudnn_datatype(dtype); const cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW; const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
...@@ -1331,19 +1328,32 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter( ...@@ -1331,19 +1328,32 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
auto& filter_desc = get_cudnn_filter_descriptor(output_filter_shape, data_type, tensor_format); auto& filter_desc = get_cudnn_filter_descriptor(output_filter_shape, data_type, tensor_format);
auto& conv_desc = get_cudnn_convolution_descriptor( auto& conv_desc = get_cudnn_convolution_descriptor(
padding_below, window_movement_strides, window_dilation_strides, mode, data_type); padding_below, window_movement_strides, window_dilation_strides, mode, data_type);
cudnnConvolutionBwdFilterAlgo_t conv_bwd_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; cudnnConvolutionBwdFilterAlgo_t conv_bwd_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
if (find_algo) if (find_algo != algo_search::NONE)
{ {
int num_algos;
int max_algos = 0;
CUDNN_SAFE_CALL( CUDNN_SAFE_CALL(
cudnnGetConvolutionBackwardFilterAlgorithm(*m_ctx->cudnn_handle, cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(*m_ctx->cudnn_handle, &max_algos));
tensor_desc_0, std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> results(max_algos);
tensor_desc_1, auto cudnn_algo_search = (find_algo == algo_search::EXPLICIT)
conv_desc, ? cudnnFindConvolutionBackwardFilterAlgorithm
filter_desc, : cudnnGetConvolutionBackwardFilterAlgorithm_v7;
CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, CUDNN_SAFE_CALL((*cudnn_algo_search)(*m_ctx->cudnn_handle,
0, tensor_desc_0,
&conv_bwd_filter_algo)); tensor_desc_1,
conv_desc,
filter_desc,
static_cast<int>(results.size()),
&num_algos,
results.data()));
results.resize(num_algos);
conv_bwd_filter_algo =
select_cudnn_algo<cudnnConvolutionBwdFilterAlgoPerf_t, cudnnConvolutionBwdFilterAlgo_t>(
results);
} }
size_t workspace_size_in_bytes = 0; size_t workspace_size_in_bytes = 0;
CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*m_ctx->cudnn_handle, CUDNN_SAFE_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(*m_ctx->cudnn_handle,
tensor_desc_0, tensor_desc_0,
......
...@@ -72,6 +72,13 @@ namespace ngraph ...@@ -72,6 +72,13 @@ namespace ngraph
Backward Backward
}; };
enum class algo_search
{
HEURISTIC,
EXPLICIT,
NONE
};
size_t build_convolution(const std::string& dtype, size_t build_convolution(const std::string& dtype,
const Shape& input_tensor_shape, const Shape& input_tensor_shape,
const Shape& input_filter_shape, const Shape& input_filter_shape,
...@@ -79,25 +86,27 @@ namespace ngraph ...@@ -79,25 +86,27 @@ namespace ngraph
const Strides& window_movement_strides, const Strides& window_movement_strides,
const Strides& window_dilation_strides, const Strides& window_dilation_strides,
const Shape& padding_below, const Shape& padding_below,
const bool find_algo = false); const algo_search find_algo = algo_search::NONE);
size_t build_convolution_backward_data(const std::string& dtype, size_t build_convolution_backward_data(
const Shape& input_filter_shape, const std::string& dtype,
const Shape& input_tensor_shape, const Shape& input_filter_shape,
const Shape& output_tensor_shape, const Shape& input_tensor_shape,
const Strides& window_movement_strides, const Shape& output_tensor_shape,
const Strides& window_dilation_strides, const Strides& window_movement_strides,
const Shape& padding_below, const Strides& window_dilation_strides,
const bool find_algo = false); const Shape& padding_below,
const algo_search find_algo = algo_search::NONE);
size_t build_convolution_backward_filter(const std::string& dtype,
const Shape& input_tensor_shape_0, size_t build_convolution_backward_filter(
const Shape& input_tensor_shape_1, const std::string& dtype,
const Shape& output_filter_shape, const Shape& input_tensor_shape_0,
const Strides& window_movement_strides, const Shape& input_tensor_shape_1,
const Strides& window_dilation_strides, const Shape& output_filter_shape,
const Shape& padding_below, const Strides& window_movement_strides,
const bool find_algo = false); const Strides& window_dilation_strides,
const Shape& padding_below,
const algo_search find_algo = algo_search::NONE);
size_t build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op, size_t build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
const std::string& dtype, const std::string& dtype,
...@@ -178,6 +187,24 @@ namespace ngraph ...@@ -178,6 +187,24 @@ namespace ngraph
cudnnConvolutionMode_t mode, cudnnConvolutionMode_t mode,
cudnnDataType_t data_type); cudnnDataType_t data_type);
template <typename PERF_TYPE, typename ALGO_TYPE>
ALGO_TYPE
select_cudnn_algo(const std::vector<PERF_TYPE>& perf_results,
size_t workspace_byte = std::numeric_limits<size_t>::max())
{
for (auto i = 0; i != perf_results.size(); ++i)
{
auto const& result = perf_results[i];
if (result.status == CUDNN_STATUS_SUCCESS &&
result.memory <= workspace_byte)
{
return result.algo;
}
}
throw ngraph_error(
"No suitable cuDNN algorithm was found for the requested operation.");
}
CUDNNDescriptors m_descriptors; CUDNNDescriptors m_descriptors;
CUDNNHostParameters m_host_parameters; CUDNNHostParameters m_host_parameters;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment