Commit 682f7b04 authored by Chris Sullivan's avatar Chris Sullivan Committed by Scott Cyphers

[cuDNN:Part 1] minimal refactoring of op::reduce (#965)

* Refactored the cudnn reduce kernel to use the nGraph Shape -> cudnnTensorDescriptor cudnn helpers that the other kernels use.

* Added caching to the cudnn reduce op.

* Adding back hashing call before returning primitive index to op::Reduce (bug fix).

* [cuDNN:Part 2] Descriptor Creation/Destruction refactoring (#969)

* Added a cuDNN descriptor factory which manages the construction and destruction of cuDNN descriptors.
It correctly calls Create/Destroy based on the cuDNN descriptor type. Previously the Destroy functions were not being called.

* Removed commented code and changed class to struct on cudnn_descriptor.

* Added comments and updated a few variable names.

* Clang compiled cuDNN kernels (those not part of CUDNNEmitter)
now use the CUDNNDescriptor factory.
parent 11ec6449
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <memory>
#include <utility>
#include <vector>

#include <cudnn.h>
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
template <typename T>
struct cudnn_descriptor;
/// \brief A factory which builds cuDNN descriptors
/// and manages their creation and destruction.
///
/// Each descriptor built through `build<T>()` is owned by this factory and
/// is destroyed (via the matching cuDNN Destroy call) when the factory is
/// destructed.
class CUDNNDescriptors
{
    // Type-erasure base class so descriptors of different cuDNN handle
    // types can be stored in a single container. The virtual destructor
    // guarantees the concrete descriptor_<T> destructor below (which calls
    // the cuDNN Destroy function) runs when the unique_ptr is released.
    class Descriptor
    {
    public:
        Descriptor() = default;
        virtual ~Descriptor() = default;
    };

public:
    CUDNNDescriptors() = default;
    ~CUDNNDescriptors() = default;

    /// \brief Create a cuDNN descriptor of type T and retain ownership of it.
    /// \tparam T a cuDNN descriptor handle type for which a
    ///           cudnn_descriptor<T> specialization exists.
    /// \return a reference to the newly created descriptor. The reference
    ///         stays valid for the lifetime of this factory: the storage is
    ///         heap-allocated, so later calls to build (and the resulting
    ///         vector growth) do not invalidate it.
    template <typename T>
    T& build()
    {
        // A function-local class for type erased storage of
        // various cudnn descriptor types which is templated
        // over function `build`'s specialization type
        class descriptor_ : public Descriptor
        {
        public:
            descriptor_() { cudnn_descriptor<T>::create(stored_descriptor); }
            ~descriptor_() { cudnn_descriptor<T>::destroy(stored_descriptor); }
            T& get() { return stored_descriptor; }
        private:
            T stored_descriptor;
        };
        // Take ownership in a unique_ptr *before* inserting into the vector:
        // if push_back throws (allocation failure during growth), the
        // unique_ptr still destroys the descriptor. The previous
        // `emplace_back(new descriptor_)` would leak it in that case.
        std::unique_ptr<Descriptor> descriptor(new descriptor_);
        m_descriptors.push_back(std::move(descriptor));
        return static_cast<descriptor_*>(m_descriptors.back().get())->get();
    }

private:
    std::vector<std::unique_ptr<Descriptor>> m_descriptors;
};
/// \brief Binds cudnnLRNDescriptor_t to its cuDNN Create/Destroy API calls;
/// any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnLRNDescriptor_t>
{
    static void create(cudnnLRNDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnCreateLRNDescriptor(&desc)); }
    static void destroy(cudnnLRNDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnDestroyLRNDescriptor(desc)); }
};
/// \brief Binds cudnnTensorDescriptor_t to its cuDNN Create/Destroy API calls;
/// any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnTensorDescriptor_t>
{
    static void create(cudnnTensorDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&desc)); }
    static void destroy(cudnnTensorDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnDestroyTensorDescriptor(desc)); }
};
/// \brief Binds cudnnSpatialTransformerDescriptor_t to its cuDNN Create/Destroy
/// API calls; any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnSpatialTransformerDescriptor_t>
{
    static void create(cudnnSpatialTransformerDescriptor_t& desc)
    {
        CUDNN_SAFE_CALL(cudnnCreateSpatialTransformerDescriptor(&desc));
    }
    static void destroy(cudnnSpatialTransformerDescriptor_t& desc)
    {
        CUDNN_SAFE_CALL(cudnnDestroySpatialTransformerDescriptor(desc));
    }
};
/// \brief Binds cudnnReduceTensorDescriptor_t to its cuDNN Create/Destroy API
/// calls; any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnReduceTensorDescriptor_t>
{
    static void create(cudnnReduceTensorDescriptor_t& desc)
    {
        CUDNN_SAFE_CALL(cudnnCreateReduceTensorDescriptor(&desc));
    }
    static void destroy(cudnnReduceTensorDescriptor_t& desc)
    {
        CUDNN_SAFE_CALL(cudnnDestroyReduceTensorDescriptor(desc));
    }
};
/// \brief Binds cudnnRNNDescriptor_t to its cuDNN Create/Destroy API calls;
/// any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnRNNDescriptor_t>
{
    static void create(cudnnRNNDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnCreateRNNDescriptor(&desc)); }
    static void destroy(cudnnRNNDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnDestroyRNNDescriptor(desc)); }
};
/// \brief Binds cudnnPoolingDescriptor_t to its cuDNN Create/Destroy API calls;
/// any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnPoolingDescriptor_t>
{
    static void create(cudnnPoolingDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnCreatePoolingDescriptor(&desc)); }
    static void destroy(cudnnPoolingDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnDestroyPoolingDescriptor(desc)); }
};
/// \brief Binds cudnnOpTensorDescriptor_t to its cuDNN Create/Destroy API
/// calls; any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnOpTensorDescriptor_t>
{
    static void create(cudnnOpTensorDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&desc)); }
    static void destroy(cudnnOpTensorDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnDestroyOpTensorDescriptor(desc)); }
};
/// \brief Binds cudnnFilterDescriptor_t to its cuDNN Create/Destroy API calls;
/// any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnFilterDescriptor_t>
{
    static void create(cudnnFilterDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnCreateFilterDescriptor(&desc)); }
    static void destroy(cudnnFilterDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnDestroyFilterDescriptor(desc)); }
};
/// \brief Binds cudnnDropoutDescriptor_t to its cuDNN Create/Destroy API calls;
/// any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnDropoutDescriptor_t>
{
    static void create(cudnnDropoutDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnCreateDropoutDescriptor(&desc)); }
    static void destroy(cudnnDropoutDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnDestroyDropoutDescriptor(desc)); }
};
/// \brief Binds cudnnConvolutionDescriptor_t to its cuDNN Create/Destroy API
/// calls; any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnConvolutionDescriptor_t>
{
    static void create(cudnnConvolutionDescriptor_t& desc)
    {
        CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&desc));
    }
    static void destroy(cudnnConvolutionDescriptor_t& desc)
    {
        CUDNN_SAFE_CALL(cudnnDestroyConvolutionDescriptor(desc));
    }
};
/// \brief Binds cudnnCTCLossDescriptor_t to its cuDNN Create/Destroy API calls;
/// any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnCTCLossDescriptor_t>
{
    static void create(cudnnCTCLossDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnCreateCTCLossDescriptor(&desc)); }
    static void destroy(cudnnCTCLossDescriptor_t& desc) { CUDNN_SAFE_CALL(cudnnDestroyCTCLossDescriptor(desc)); }
};
/// \brief Binds cudnnActivationDescriptor_t to its cuDNN Create/Destroy API
/// calls; any failure is reported through CUDNN_SAFE_CALL.
template <>
struct cudnn_descriptor<cudnnActivationDescriptor_t>
{
    static void create(cudnnActivationDescriptor_t& desc)
    {
        CUDNN_SAFE_CALL(cudnnCreateActivationDescriptor(&desc));
    }
    static void destroy(cudnnActivationDescriptor_t& desc)
    {
        CUDNN_SAFE_CALL(cudnnDestroyActivationDescriptor(desc));
    }
};
}
}
}
This diff is collapsed.
......@@ -25,6 +25,7 @@
#include <cudnn.h>
#include "ngraph/axis_set.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/shape.hpp"
......@@ -39,7 +40,6 @@ namespace ngraph
std::vector<int> compute_strides(const Shape&);
std::vector<int> compute_strides(const std::vector<int>&);
std::vector<int> get_vector_int_from_size_t(const std::vector<size_t>&);
cudnnTensorDescriptor_t tensor_descriptor_from_shape(const Shape& shape);
}
class GPUPrimitiveEmitter;
......@@ -77,8 +77,12 @@ namespace ngraph
const Shape& param_shape,
double epsilon);
cudnnTensorDescriptor_t& tensor_descriptor_from_shape(const Shape& shape);
private:
CUDNNEmitter(GPUPrimitiveEmitter* emitter);
CUDNNDescriptors m_descriptors;
GPUPrimitiveEmitter* m_primitive_emitter;
};
}
......
......@@ -151,8 +151,7 @@ namespace ngraph
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
......@@ -161,8 +160,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_ADD,
CUDNN_DATA_FLOAT,
......@@ -611,8 +609,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
......@@ -621,8 +618,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MAX,
CUDNN_DATA_FLOAT,
......@@ -651,8 +647,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
......@@ -661,8 +656,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MIN,
CUDNN_DATA_FLOAT,
......@@ -691,8 +685,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = -1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
......@@ -701,8 +694,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_ADD,
CUDNN_DATA_FLOAT,
......@@ -1095,8 +1087,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
......@@ -1105,8 +1096,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MUL,
CUDNN_DATA_FLOAT,
......@@ -1166,8 +1156,7 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor;
CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&descriptor));
auto& descriptor = descriptors.build<cudnnTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
......@@ -1176,8 +1165,7 @@ CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(descriptor,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
CUDNN_SAFE_CALL(cudnnCreateOpTensorDescriptor(&opTensorDesc));
auto& opTensorDesc = descriptors.build<cudnnOpTensorDescriptor_t>();
CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_SQRT,
CUDNN_DATA_FLOAT,
......
......@@ -307,6 +307,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
......@@ -421,6 +422,10 @@ using namespace std;
}
}
}
// Add cudnn descriptor factory for descriptor management.
// After the cuDNN code emitted in gpu_emitter.cc is refactored
// into the CUDNNEmitter class, this can be removed.
writer << "static runtime::gpu::CUDNNDescriptors descriptors;\n";
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
......
......@@ -60,8 +60,7 @@ void runtime::gpu::kernel::emit_cudnnConvolutionDescriptor(codegen::CodeWriter&
const std::string& mode,
const std::string& data_type)
{
writer << "cudnnConvolutionDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnConvolutionDescriptor_t>();\n";
if (padding.size() == 2)
{
......@@ -98,8 +97,7 @@ void runtime::gpu::kernel::emit_cudnnFilterDescriptor(codegen::CodeWriter& write
dimensions[i] = shape[idx++];
}
writer << "cudnnFilterDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateFilterDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnFilterDescriptor_t>();\n";
if (dimensions.size() <= 4)
{
......@@ -135,9 +133,7 @@ void runtime::gpu::kernel::emit_cudnnTensorDescriptor(codegen::CodeWriter& write
dimensions[i] = shape[idx++];
}
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnTensorDescriptor_t>();\n";
if (dimensions.size() <= 4)
{
writer << "CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(" << name << ",\n";
......@@ -167,8 +163,7 @@ void runtime::gpu::kernel::emit_cudnnTensor4dDescriptor(codegen::CodeWriter& wri
const std::string& data_type,
const std::array<size_t, 4>& axes)
{
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnTensorDescriptor_t>();\n";
writer << "CUDNN_SAFE_CALL(cudnnSetTensor4dDescriptor(" << name << ",\n";
writer << " /*format=*/" << format << ",\n";
writer << " /*dataType=*/" << data_type;
......@@ -188,8 +183,7 @@ void runtime::gpu::kernel::emit_cudnnTensorNdDescriptor(codegen::CodeWriter& wri
{
writer << "const int " << name << "_axes[] = {" << join(axes) << "};\n";
writer << "const int " << name << "_strides[] = {" << join(strides) << "};\n";
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << name << "));\n";
writer << "auto& " << name << " = descriptors.build<cudnnTensorDescriptor_t>();\n";
writer << "CUDNN_SAFE_CALL(cudnnSetTensorNdDescriptor(" << name << ",\n";
writer << " /*dataType=*/" << data_type << ",\n";
writer << " /*num_dimensions=*/" << num_axes << ",\n";
......@@ -208,8 +202,7 @@ void runtime::gpu::kernel::emit_cudnnReduceTensor(codegen::CodeWriter& writer,
const float& alpha,
const float& beta)
{
writer << "cudnnReduceTensorDescriptor_t reduceTensorDesc;\n";
writer << "CUDNN_SAFE_CALL(cudnnCreateReduceTensorDescriptor(&reduceTensorDesc));\n";
writer << "auto& reduceTensorDesc = descriptors.build<cudnnReduceTensorDescriptor_t>();\n";
writer << "CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(reduceTensorDesc,\n";
writer << " " << reduce_op << ",\n";
writer << " " << data_type << ",\n";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment