Commit 682f7b04 authored by Chris Sullivan, committed by Scott Cyphers

[cuDNN:Part 1] minimal refactoring of op::reduce (#965)

* Refactored the cuDNN reduce kernel to use the nGraph Shape -> cudnnTensorDescriptor helpers that the other kernels use.

* Added caching to the cuDNN reduce op; a sketch of the lookup/insert/cache flow follows the commit message.

* Added back the hashing call before returning the primitive index to op::Reduce (bug fix).

* [cuDNN:Part 2] Descriptor Creation/Destruction refactoring (#969)

* Added a cuDNN descriptor factory that manages the construction and destruction of cuDNN descriptors,
calling the correct Create/Destroy functions for each descriptor type. Previously the Destroy functions were never called.

* Removed commented-out code and changed cudnn_descriptor from a class to a struct.

* Added comments and updated a few variable names.

* Clang-compiled cuDNN kernels (those not part of CUDNNEmitter) now use the CUDNNDescriptors factory.
parent 11ec6449
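
For context on the caching bullet above: build_reduce_forward now follows the same lookup/insert/cache flow as the other CUDNNEmitter builders. It hashes the reduce op, input shape, and reduction axes, returns an existing primitive index on a cache hit, and otherwise builds the primitive, inserts it, and caches the index. Below is a minimal standalone sketch of that flow; the names are hypothetical and this is not the actual nGraph GPUPrimitiveEmitter API.

#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for gpu::primitive: a callable taking input/output pointer arrays.
using primitive = std::function<void(void**, void**)>;

class PrimitiveCache
{
public:
    // Return the index of a previously built primitive for `hash`, or build
    // a new one with `builder`, remember it, and return its index.
    size_t get_or_build(const std::string& hash, const std::function<primitive()>& builder)
    {
        auto it = m_index.find(hash);
        if (it != m_index.end())
        {
            return it->second;
        }
        m_primitives.push_back(builder());
        size_t index = m_primitives.size() - 1;
        m_index.emplace(hash, index);
        return index;
    }

private:
    std::vector<primitive> m_primitives;
    std::unordered_map<std::string, size_t> m_index;
};
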
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once

#include <memory>
#include <vector>

#include <cudnn.h>

#include "ngraph/runtime/gpu/gpu_util.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            template <typename T>
            struct cudnn_descriptor;

            /// \brief A factory which builds cuDNN descriptors
            /// and manages their creation and destruction.
            class CUDNNDescriptors
            {
                class Descriptor
                {
                public:
                    Descriptor() = default;
                    virtual ~Descriptor() = default;
                };

            public:
                CUDNNDescriptors() = default;
                ~CUDNNDescriptors() = default;

                template <typename T>
                T& build()
                {
                    // A function-local class for type erased storage of
                    // various cudnn descriptor types which is templated
                    // over function `build`'s specialization type
                    class descriptor_ : public Descriptor
                    {
                    public:
                        descriptor_() { cudnn_descriptor<T>::create(stored_descriptor); }
                        ~descriptor_() { cudnn_descriptor<T>::destroy(stored_descriptor); }
                        T& get() { return stored_descriptor; }
                    private:
                        T stored_descriptor;
                    };

                    m_descriptors.emplace_back(new descriptor_);
                    return static_cast<descriptor_*>(m_descriptors.back().get())->get();
                }

            private:
                std::vector<std::unique_ptr<Descriptor>> m_descriptors;
            };

            template <>
            struct cudnn_descriptor<cudnnLRNDescriptor_t>
            {
                static void create(cudnnLRNDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateLRNDescriptor(&desc));
                }
                static void destroy(cudnnLRNDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyLRNDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnTensorDescriptor_t>
            {
                static void create(cudnnTensorDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
                }
                static void destroy(cudnnTensorDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyTensorDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnSpatialTransformerDescriptor_t>
            {
                static void create(cudnnSpatialTransformerDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateSpatialTransformerDescriptor(&desc));
                }
                static void destroy(cudnnSpatialTransformerDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroySpatialTransformerDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnReduceTensorDescriptor_t>
            {
                static void create(cudnnReduceTensorDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateReduceTensorDescriptor(&desc));
                }
                static void destroy(cudnnReduceTensorDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyReduceTensorDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnRNNDescriptor_t>
            {
                static void create(cudnnRNNDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateRNNDescriptor(&desc));
                }
                static void destroy(cudnnRNNDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyRNNDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnPoolingDescriptor_t>
            {
                static void create(cudnnPoolingDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreatePoolingDescriptor(&desc));
                }
                static void destroy(cudnnPoolingDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyPoolingDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnOpTensorDescriptor_t>
            {
                static void create(cudnnOpTensorDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&desc));
                }
                static void destroy(cudnnOpTensorDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyOpTensorDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnFilterDescriptor_t>
            {
                static void create(cudnnFilterDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateFilterDescriptor(&desc));
                }
                static void destroy(cudnnFilterDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyFilterDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnDropoutDescriptor_t>
            {
                static void create(cudnnDropoutDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateDropoutDescriptor(&desc));
                }
                static void destroy(cudnnDropoutDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyDropoutDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnConvolutionDescriptor_t>
            {
                static void create(cudnnConvolutionDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&desc));
                }
                static void destroy(cudnnConvolutionDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyConvolutionDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnCTCLossDescriptor_t>
            {
                static void create(cudnnCTCLossDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateCTCLossDescriptor(&desc));
                }
                static void destroy(cudnnCTCLossDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyCTCLossDescriptor(desc));
                }
            };

            template <>
            struct cudnn_descriptor<cudnnActivationDescriptor_t>
            {
                static void create(cudnnActivationDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnCreateActivationDescriptor(&desc));
                }
                static void destroy(cudnnActivationDescriptor_t& desc)
                {
                    CUDNN_SAFE_CALL(cudnnDestroyActivationDescriptor(desc));
                }
            };
        }
    }
}
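
A minimal usage sketch of the factory above, assuming it is compiled inside the nGraph GPU backend (gpu_util.hpp, included by the header, supplies CUDNN_SAFE_CALL); the function name and dimensions are illustrative only.

#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"

void descriptor_factory_example()
{
    ngraph::runtime::gpu::CUDNNDescriptors descriptors;

    // build() calls cudnnCreateTensorDescriptor and registers the descriptor
    // for destruction when `descriptors` goes out of scope.
    auto& tensor_desc = descriptors.build<cudnnTensorDescriptor_t>();
    CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(
        tensor_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 3, 32, 32));

    // Different descriptor types share the same type-erased storage.
    auto& op_desc = descriptors.build<cudnnOpTensorDescriptor_t>();
    CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(
        op_desc, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN));
} // cudnnDestroy*Descriptor is called for both descriptors here
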
@@ -26,11 +26,10 @@
using namespace ngraph;
cudnnTensorDescriptor_t runtime::gpu::cudnn_util::tensor_descriptor_from_shape(const Shape& shape)
cudnnTensorDescriptor_t&
runtime::gpu::CUDNNEmitter::tensor_descriptor_from_shape(const Shape& shape)
{
cudnnTensorDescriptor_t desc;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
cudnnTensorDescriptor_t& desc = m_descriptors.build<cudnnTensorDescriptor_t>();
if (shape.size() < 4)
{
std::array<int, 4> dimensions;
@@ -116,96 +115,32 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
const Shape& input_shape,
const AxisSet& reduction_axes)
{
std::function<cudnnTensorDescriptor_t(void)> get_input_desc;
std::function<cudnnTensorDescriptor_t(void)> get_output_desc;
if (input_shape.size() <= 4)
{
// construct input tensor descriptor rt impl.
std::array<int, 4> dimensions;
size_t pos = 0;
for (size_t i = input_shape.size(); i < 4; i++)
{
dimensions[pos++] = 1;
}
for (size_t i = 0; i < input_shape.size(); i++)
{
dimensions[pos++] = static_cast<int>(input_shape[i]);
}
get_input_desc = [dimensions]() {
cudnnTensorDescriptor_t desc;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(desc,
CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT,
dimensions[0],
dimensions[1],
dimensions[2],
dimensions[3]));
return desc;
};
// mark reduced axes of input tensor for output tensor descriptor
for (auto const& idx_dim : reduction_axes)
{
dimensions[(4 - input_shape.size()) + idx_dim] = 1;
}
std::stringstream ss;
ss << "reduce_op" << reduce_op << "_i" << join(input_shape, "_") << "_ra"
<< join(reduction_axes, "_");
std::string hash = ss.str();
get_output_desc = [dimensions]() {
cudnnTensorDescriptor_t desc;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(desc,
CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT,
dimensions[0],
dimensions[1],
dimensions[2],
dimensions[3]));
return desc;
};
}
// descriptors for Nd tensors
else
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
auto dimensions = runtime::gpu::cudnn_util::get_vector_int_from_size_t(input_shape);
get_input_desc = [dimensions]() {
cudnnTensorDescriptor_t desc;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
CUDNN_SAFE_CALL(
cudnnSetTensorNdDescriptor(desc,
CUDNN_DATA_FLOAT,
static_cast<int>(dimensions.size()),
dimensions.data(),
cudnn_util::compute_strides(dimensions).data()));
return desc;
};
// mark reduced axes of input tensor for output tensor descriptor
for (auto const& idx_dim : reduction_axes)
{
dimensions[idx_dim] = 1;
}
return primitive_index;
}
get_output_desc = [dimensions]() {
cudnnTensorDescriptor_t desc;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc));
CUDNN_SAFE_CALL(
cudnnSetTensorNdDescriptor(desc,
CUDNN_DATA_FLOAT,
static_cast<int>(dimensions.size()),
dimensions.data(),
cudnn_util::compute_strides(dimensions).data()));
return desc;
};
auto& desc = m_descriptors.build<cudnnReduceTensorDescriptor_t>();
auto& input_desc = tensor_descriptor_from_shape(input_shape);
Shape output_shape = input_shape;
// mark reduced axes of input tensor for output tensor descriptor
for (auto const& idx_dim : reduction_axes)
{
output_shape[idx_dim] = 1;
}
// emit sum reduce operation
std::unique_ptr<gpu::primitive> reduce(new gpu::primitive{
[ctx, reduce_op, get_input_desc, get_output_desc](void** inputs, void** outputs) {
auto input_desc = get_input_desc();
auto output_desc = get_output_desc();
cudnnReduceTensorDescriptor_t reduceTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateReduceTensorDescriptor(&reduceTensorDesc));
CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(reduceTensorDesc,
auto& output_desc = tensor_descriptor_from_shape(output_shape);
// emit reduce operation
std::unique_ptr<gpu::primitive> reduce(
new gpu::primitive{[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(desc,
reduce_op,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN,
@@ -213,11 +148,11 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*ctx->cudnn_handle, reduceTensorDesc, input_desc, output_desc, &workspace_size));
*ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
auto workspace_ptr = create_gpu_buffer(workspace_size);
float alpha = 1.0, beta = 0.0;
CUDNN_SAFE_CALL(cudnnReduceTensor(*ctx->cudnn_handle,
reduceTensorDesc,
desc,
nullptr,
0,
workspace_ptr,
@@ -231,7 +166,9 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const runtime::gpu::GPUR
free_gpu_buffer(workspace_ptr);
}});
return this->m_primitive_emitter->insert(std::move(reduce));
primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeContext* ctx,
@@ -260,12 +197,12 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
return primitive_index;
}
cudnnPoolingDescriptor_t desc;
auto input_desc = runtime::gpu::cudnn_util::tensor_descriptor_from_shape(input_shape);
auto output_desc = runtime::gpu::cudnn_util::tensor_descriptor_from_shape(output_shape);
auto& desc = m_descriptors.build<cudnnPoolingDescriptor_t>();
auto& input_desc = tensor_descriptor_from_shape(input_shape);
auto& output_desc = tensor_descriptor_from_shape(output_shape);
if (input_shape.size() == 4)
{
CUDNN_SAFE_CALL(cudnnCreatePoolingDescriptor(&desc));
CUDNN_SAFE_CALL(cudnnSetPooling2dDescriptor(desc,
pool_op,
CUDNN_NOT_PROPAGATE_NAN,
@@ -287,7 +224,6 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
w_strides[i] = static_cast<int>(window_strides[i]);
w_padding[i] = static_cast<int>(padding_below[i]);
}
CUDNN_SAFE_CALL(cudnnCreatePoolingDescriptor(&desc));
CUDNN_SAFE_CALL(cudnnSetPoolingNdDescriptor(desc,
pool_op,
CUDNN_NOT_PROPAGATE_NAN,
@@ -308,43 +244,45 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const runtime::gpu::GPURuntimeC
case (Prop::Inference):
case (Prop::Forward):
{
pool.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
float alpha = 1.0, beta = 0.0;
CUDNN_SAFE_CALL(cudnnPoolingForward(*ctx->cudnn_handle,
desc,
&alpha,
input_desc,
inputs[0],
&beta,
output_desc,
outputs[0]));
}});
pool.reset(new gpu::primitive{
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
float alpha = 1.0, beta = 0.0;
CUDNN_SAFE_CALL(cudnnPoolingForward(*ctx->cudnn_handle,
desc,
&alpha,
input_desc,
inputs[0],
&beta,
output_desc,
outputs[0]));
}});
break;
}
case (Prop::Backward):
{
pool.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
float alpha = 1.0, beta = 0.0;
// cuDNN requires the output tensor of the maxpool fprop to be passed even though
// it is not mathematically necessary. It appears, however, that it is not actually
// used as the adjoints are passed in place and the correct result is achieved.
CUDNN_SAFE_CALL(cudnnPoolingBackward(*ctx->cudnn_handle,
desc,
&alpha,
// output (wrt maxpool) tensor
output_desc,
inputs[1],
// adjoint of output
output_desc,
inputs[1],
// input (wrt maxpool) tensor
input_desc,
inputs[0],
&beta,
// adjoint of input
input_desc,
outputs[0]));
}});
pool.reset(new gpu::primitive{
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
float alpha = 1.0, beta = 0.0;
// cuDNN requires the output tensor of the maxpool fprop to be passed even though
// it is not mathematically necessary. It appears, however, that it is not actually
// used as the adjoints are passed in place and the correct result is achieved.
CUDNN_SAFE_CALL(cudnnPoolingBackward(*ctx->cudnn_handle,
desc,
&alpha,
// output (wrt maxpool) tensor
output_desc,
inputs[1],
// adjoint of output
output_desc,
inputs[1],
// input (wrt maxpool) tensor
input_desc,
inputs[0],
&beta,
// adjoint of input
input_desc,
outputs[0]));
}});
break;
}
}
@@ -381,9 +319,8 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
throw std::runtime_error("Batch Norm epsilon is less than CUDNN_BN_MIN_EPSILON");
}
cudnnTensorDescriptor_t derived_param_desc;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&derived_param_desc));
auto tensor_desc = runtime::gpu::cudnn_util::tensor_descriptor_from_shape(tensor_shape);
auto& derived_param_desc = m_descriptors.build<cudnnTensorDescriptor_t>();
auto& tensor_desc = tensor_descriptor_from_shape(tensor_shape);
CUDNN_SAFE_CALL(cudnnDeriveBNTensorDescriptor(derived_param_desc, tensor_desc, bn_op));
float alpha = 1.0, beta = 0.0;
@@ -392,28 +329,28 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
{
case Prop::Inference:
{
batchnorm.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardInference(*ctx->cudnn_handle,
bn_op,
&alpha,
&beta,
tensor_desc,
inputs[2], // tensor
tensor_desc,
outputs[0], // tensor
derived_param_desc,
inputs[0], // gain
inputs[1], // bias
inputs[3], // mean
inputs[4], // variance
epsilon));
}});
batchnorm.reset(new gpu::primitive{
[=, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardInference(*ctx->cudnn_handle,
bn_op,
&alpha,
&beta,
tensor_desc,
inputs[2], // tensor
tensor_desc,
outputs[0], // tensor
derived_param_desc,
inputs[0], // gain
inputs[1], // bias
inputs[3], // mean
inputs[4], // variance
epsilon));
}});
break;
}
case Prop::Forward:
{
cudnnOpTensorDescriptor_t op_desc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&op_desc));
auto& op_desc = m_descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(
op_desc, CUDNN_OP_TENSOR_MUL, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN));
@@ -427,64 +364,66 @@ size_t runtime::gpu::CUDNNEmitter::build_batchnorm(const runtime::gpu::GPURuntim
// during inference. see commit note for 3b081ce for more details.
float m = shape_size(tensor_shape) / tensor_shape[1];
float bias_factor = (m - 1) / m;
batchnorm.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardTraining(*ctx->cudnn_handle,
bn_op,
&alpha,
&beta,
tensor_desc,
inputs[2],
tensor_desc,
outputs[0],
derived_param_desc,
inputs[0],
inputs[1],
exp_avg_factor,
outputs[1],
outputs[2],
epsilon,
NULL,
NULL));
// convert to biased variance
CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,
op_desc,
&beta,
derived_param_desc,
outputs[2],
&beta,
derived_param_desc,
outputs[2],
&bias_factor,
derived_param_desc,
outputs[2]));
}});
batchnorm.reset(new gpu::primitive{
[=, &op_desc, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationForwardTraining(*ctx->cudnn_handle,
bn_op,
&alpha,
&beta,
tensor_desc,
inputs[2],
tensor_desc,
outputs[0],
derived_param_desc,
inputs[0],
inputs[1],
exp_avg_factor,
outputs[1],
outputs[2],
epsilon,
NULL,
NULL));
// convert to biased variance
CUDNN_SAFE_CALL(cudnnOpTensor(*ctx->cudnn_handle,
op_desc,
&beta,
derived_param_desc,
outputs[2],
&beta,
derived_param_desc,
outputs[2],
&bias_factor,
derived_param_desc,
outputs[2]));
}});
break;
}
case Prop::Backward:
{
batchnorm.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationBackward(
*ctx->cudnn_handle,
bn_op,
&alpha,
&beta,
&alpha,
&beta,
tensor_desc,
inputs[2 /* input tensor x */],
tensor_desc,
inputs[5 /* dy */],
tensor_desc,
outputs[0 /* dx */],
derived_param_desc,
inputs[0 /* gamma */],
outputs[1 /* dgamma */],
outputs[2 /* dbeta */],
epsilon,
NULL, // inputs[3 /* mu batch mean*/],
NULL)); // inputs[4 /* 1/sig**2 batch inverse variance*/]);
}});
batchnorm.reset(new gpu::primitive{
[=, &tensor_desc, &derived_param_desc](void** inputs, void** outputs) {
CUDNN_SAFE_CALL(cudnnBatchNormalizationBackward(
*ctx->cudnn_handle,
bn_op,
&alpha,
&beta,
&alpha,
&beta,
tensor_desc,
inputs[2 /* input tensor x */],
tensor_desc,
inputs[5 /* dy */],
tensor_desc,
outputs[0 /* dx */],
derived_param_desc,
inputs[0 /* gamma */],
outputs[1 /* dgamma */],
outputs[2 /* dbeta */],
epsilon,
NULL, // inputs[3 /* mu batch mean*/],
NULL)); // inputs[4 /* 1/sig**2 batch inverse variance*/]);
}});
break;
}
}
@@ -25,6 +25,7 @@
#include <cudnn.h>
#include "ngraph/axis_set.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/shape.hpp"
@@ -39,7 +40,6 @@ namespace ngraph
std::vector<int> compute_strides(const Shape&);
std::vector<int> compute_strides(const std::vector<int>&);
std::vector<int> get_vector_int_from_size_t(const std::vector<size_t>&);
cudnnTensorDescriptor_t tensor_descriptor_from_shape(const Shape& shape);
}
class GPUPrimitiveEmitter;
@@ -77,8 +77,12 @@ namespace ngraph
const Shape& param_shape,
double epsilon);
cudnnTensorDescriptor_t& tensor_descriptor_from_shape(const Shape& shape);
private:
CUDNNEmitter(GPUPrimitiveEmitter* emitter);
CUDNNDescriptors m_descriptors;
GPUPrimitiveEmitter* m_primitive_emitter;
};
}
@@ -151,8 +151,7 @@ namespace ngraph
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
@@ -161,8 +160,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_ADD,
CUDNN_DATA_FLOAT,
@@ -611,8 +609,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
@@ -621,8 +618,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MAX,
CUDNN_DATA_FLOAT,
@@ -651,8 +647,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
@@ -661,8 +656,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MIN,
CUDNN_DATA_FLOAT,
@@ -691,8 +685,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = -1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
......@@ -701,8 +694,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_ADD,
CUDNN_DATA_FLOAT,
@@ -1095,8 +1087,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
@@ -1105,8 +1096,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MUL,
CUDNN_DATA_FLOAT,
@@ -1166,8 +1156,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
@@ -1176,8 +1165,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_SQRT,
CUDNN_DATA_FLOAT,
@@ -307,6 +307,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
@@ -421,6 +422,10 @@ using namespace std;
}
}
}
// Add cudnn descriptor factory for descriptor management.
// After the cuDNN code emitted in gpu_emitter.cc is refactored
// into the CUDNNEmitter class, this can be removed.
writer << "static runtime::gpu::CUDNNDescriptors descriptors;\n";
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
@@ -60,8 +60,7 @@ void runtime::gpu::kernel::emit_cudnnConvolutionDescriptor(codegen::CodeWriter&
const std::string& mode,
const std::string& data_type)
{
writer << "cudnnConvolutionDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnConvolutionDescriptor_t>();\n";
if (padding.size() == 2)
{
@@ -98,8 +97,7 @@ void runtime::gpu::kernel::emit_cudnnFilterDescriptor(codegen::CodeWriter& write
dimensions[i] = shape[idx++];
}
writer << "cudnnFilterDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateFilterDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnFilterDescriptor_t>();\n";
if (dimensions.size() <= 4)
{
@@ -135,9 +133,7 @@ void runtime::gpu::kernel::emit_cudnnTensorDescriptor(codegen::CodeWriter& write
dimensions[i] = shape[idx++];
}
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnTensorDescriptor_t>();\n";
if (dimensions.size() <= 4)
{
writer << "CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(" << name << ",\n";
@@ -167,8 +163,7 @@ void runtime::gpu::kernel::emit_cudnnTensor4dDescriptor(codegen::CodeWriter& wri
const std::string& data_type,
const std::array<size_t, 4>& axes)
{
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnTensorDescriptor_t>();\n";
writer << "CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(" << name << ",\n";
writer << " /*format=*/" << format << ",\n";
writer << " /*dataType=*/" << data_type;
@@ -188,8 +183,7 @@ void runtime::gpu::kernel::emit_cudnnTensorNdDescriptor(codegen::CodeWriter& wri
{
writer << "const int " << name << "_axes[] = {" << join(axes) << "};\n";
writer << "const int " << name << "_strides[] = {" << join(strides) << "};\n";
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnTensorDescriptor_t>();\n";
writer << "CUDNN_SAFE_CALL(cudnnSetTensorNdDescriptor(" << name << ",\n";
writer << " /*dataType=*/" << data_type << ",\n";
writer << " /*num_dimensions=*/" << num_axes << ",\n";
@@ -208,8 +202,7 @@ void runtime::gpu::kernel::emit_cudnnReduceTensor(codegen::CodeWriter& writer,
const float& alpha,
const float& beta)
{
writer << "cudnnReduceTensorDescriptor_t reduceTensorDesc;\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateReduceTensorDescriptor(&reduceTensorDesc));\n";
writer << "auto& reduceTensorDesc = descriptors.build<cudnnReduceTensorDescriptor_t>();\n";
writer << "CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(reduceTensorDesc,\n";
writer << " " << reduce_op << ",\n";
writer << " " << data_type << ",\n";