Commit eef2b19d authored by Fenglei's avatar Fenglei Committed by Nick Korovaiko

enable cudnn datatype support (#1122)

* enable multi datatype support for cuDNN. refactor binary ops using cudnn

* fix bugs

* add tests to skip list that CUDNN does not support

* no int support on cuDNN for backward pooling

* no GPU.dot_4d_5d_multi_axis_big_fp64_VERY_SLOW test anymore

* clang format

* throw if datatype is int8 or int32 for backward pooling

* comments

* fix list in unit_test.manifest

* add type support for alpha, beta

* fix bugs

* datatype support for alpha, beta

* missing ()

* clang format

* batchnorm backward bug fix

* remove debug info

* change member function name to snake case. remove comments

* use nullptr instead of NULL

* code style, use cuDNN everywhere in comments

* add cudnn host parameters memory manager.

* change name to allocate_by_datatype

* compiled

* debug

* fix bug: using list instead of vector, vector address will change each time it resize

* add CUDNN_DATA_UINT8 and CUDNN_DATA_UINT8x4
parent 35b04e6a
This diff is collapsed.
......@@ -26,6 +26,7 @@
#include "ngraph/axis_set.hpp"
#include "ngraph/runtime/gpu/cudnn_descriptors.hpp"
#include "ngraph/runtime/gpu/cudnn_host_parameters.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/shape.hpp"
......@@ -56,7 +57,7 @@ namespace ngraph
};
size_t build_convolution(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const std::string& dtype,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
const Shape& output_tensor_shape,
......@@ -65,7 +66,7 @@ namespace ngraph
const Shape& padding_below);
size_t build_convolution_backward_data(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const std::string& dtype,
const Shape& input_filter_shape,
const Shape& input_tensor_shape,
const Shape& output_tensor_shape,
......@@ -74,7 +75,7 @@ namespace ngraph
const Shape& padding_below);
size_t build_convolution_backward_filter(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnDataType_t data_type,
const std::string& dtype,
const Shape& input_tensor_shape_0,
const Shape& input_tensor_shape_1,
const Shape& output_filter_shape,
......@@ -84,11 +85,21 @@ namespace ngraph
size_t build_reduce_forward(const GPURuntimeContext* ctx,
const cudnnReduceTensorOp_t& reduce_op,
const std::string& dtype,
const Shape& input_shape,
const AxisSet& reduction_axes);
size_t build_tensor_op(const GPURuntimeContext* ctx,
const cudnnOpTensorOp_t& tensor_op,
const std::string& dtype,
const Shape& input_shape,
const double alpha0,
const double alpha1,
const double beta);
size_t build_pooling(const GPURuntimeContext* ctx,
const cudnnPoolingMode_t& pool_op,
const std::string& dtype,
const Prop& direction,
const ngraph::Shape& input_shape,
const ngraph::Shape& output_shape,
......@@ -99,6 +110,7 @@ namespace ngraph
size_t build_batchnorm(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnBatchNormMode_t& bn_op,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape,
const Shape& param_shape,
......@@ -107,10 +119,21 @@ namespace ngraph
size_t build_softmax(const runtime::gpu::GPURuntimeContext* ctx,
const cudnnSoftmaxAlgorithm_t& algorithm,
const cudnnSoftmaxMode_t& mode,
const std::string& dtype,
const Prop& direction,
const Shape& tensor_shape);
cudnnTensorDescriptor_t& tensor_descriptor_from_shape(const Shape& shape);
private:
CUDNNEmitter(GPUPrimitiveEmitter* emitter);
void* get_data_by_type(cudnnDataType_t data_type, double value);
cudnnDataType_t get_cudnn_datatype(std::string dtype);
cudnnTensorDescriptor_t&
tensor_descriptor_from_shape(const Shape& shape,
const cudnnDataType_t data_type,
const cudnnTensorFormat_t tensor_format);
cudnnFilterDescriptor_t&
get_cudnn_filter_descriptor(const Shape& shape,
const cudnnDataType_t data_type,
......@@ -122,10 +145,9 @@ namespace ngraph
cudnnConvolutionMode_t mode,
cudnnDataType_t data_type);
private:
CUDNNEmitter(GPUPrimitiveEmitter* emitter);
CUDNNDescriptors m_descriptors;
CUDNNHostParameters m_host_parameters;
GPUPrimitiveEmitter* m_primitive_emitter;
};
}
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <list>
#include <memory>
#include <cudnn.h>
#include "ngraph/log.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
{
namespace runtime
{
namespace gpu
{
/// \brief A factory which builds cuDNN host parameters
/// and manages their creation and destruction.
class CUDNNHostParameters
{
public:
    CUDNNHostParameters() = default;
    ~CUDNNHostParameters() = default;

    /// \brief Allocate host-side storage for a scalar parameter (e.g. cuDNN's
    ///        alpha/beta scaling factors), converted to the given cuDNN data type.
    /// \param data_type cuDNN element type the scalar must be stored as.
    /// \param value     Scalar value, supplied as double and narrowed as needed.
    /// \return Pointer to the stored scalar, valid for the lifetime of this object.
    /// \throws std::runtime_error for types cuDNN host parameters do not support
    ///         (half, packed int8x4/uint8x4, uint8) or any unrecognized type.
    void* allocate_by_datatype(const cudnnDataType_t data_type, const double value)
    {
        void* r = nullptr;
        switch (data_type)
        {
        case CUDNN_DATA_FLOAT:
            m_host_parameters_float.push_back(static_cast<float>(value));
            r = static_cast<void*>(&m_host_parameters_float.back());
            break;
        case CUDNN_DATA_DOUBLE:
            m_host_parameters_double.push_back(value);
            r = static_cast<void*>(&m_host_parameters_double.back());
            break;
        case CUDNN_DATA_INT8:
            m_host_parameters_int8_t.push_back(static_cast<int8_t>(value));
            r = static_cast<void*>(&m_host_parameters_int8_t.back());
            break;
        case CUDNN_DATA_INT32:
            m_host_parameters_int32_t.push_back(static_cast<int32_t>(value));
            r = static_cast<void*>(&m_host_parameters_int32_t.back());
            break;
        case CUDNN_DATA_HALF:
        case CUDNN_DATA_INT8x4:
        case CUDNN_DATA_UINT8:
        case CUDNN_DATA_UINT8x4:
        // default: a cudnnDataType_t value not covered above must not fall
        // through and return nullptr — callers would hand it to cuDNN as a
        // host scalar pointer.
        default:
            throw std::runtime_error("datatype is not supported by cuDNN");
        }
        return r;
    }

private:
    // std::list is used deliberately: element addresses remain stable across
    // push_back, whereas std::vector may reallocate and invalidate previously
    // returned pointers.
    std::list<int8_t> m_host_parameters_int8_t;
    std::list<int32_t> m_host_parameters_int32_t;
    std::list<float> m_host_parameters_float;
    std::list<double> m_host_parameters_double;
};
}
}
}
......@@ -288,6 +288,13 @@ namespace ngraph
static constexpr const char* math_kernel = "!x0";
};
// Trait specialization mapping ngraph::op::Negative to its CUDA element-wise
// kernel: the op is registered under the name "negative" and its math kernel
// applies unary minus to the single input x0.
template <>
struct CudaOpMap<ngraph::op::Negative>
{
static constexpr const char* op = "negative";
static constexpr const char* math_kernel = "-x0";
};
template <>
struct CudaOpMap<ngraph::op::Select>
{
......
This diff is collapsed.
......@@ -187,7 +187,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Log>},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::NotEqual>},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Power>},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Select>},
......@@ -431,7 +431,7 @@ using namespace std;
}
}
}
// Add cudnn descriptor factory for descriptor management.
// Add cuDNN descriptor factory for descriptor management.
// After the cuDNN code emitted in gpu_emitter.cc is refactored
// into the CUDNNEmitter class, this can be removed.
writer << "static runtime::gpu::CUDNNDescriptors descriptors;\n\n";
......
#int64 is not supported by cuDNN
abc_int64
batch_norm_one_output
batch_norm_three_outputs
#need to check
computation_reuse
#int64 is not supported
concat_matrix_int64
#convolution 4d is work in progress
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
......@@ -12,16 +16,27 @@ convolution_4d_4items_strided_dilated
convolution_4d_4items_strided_dilated_padded
convolution_4d_4items_strided_dilated_padded_neg
convolution_4d_4items_strided_dilated_padded_same
#cuDNN does not have arithmetic exceptions
divide_by_zero_int32
dot_4d_5d_multi_axis_big_fp64_VERY_SLOW
#int64 is not supported by cuDNN
dot_matrix_vector_int64
#no mkldnn on GPU
mkldnn_layouts
#error throw is not the same on GPU, not supported yet
one_hot_scalar_fp_nonint_in_3
one_hot_scalar_oob_in_3
one_hot_vector_1_barely_oob
one_hot_vector_1_far_oob
one_hot_vector_1_fp_nonint
#select_and_scatter is deprecated
select_and_scatter_3d_without_overlap
select_and_scatter_with_overlap
select_and_scatter_without_overlap
#custom_mem is not implemented on GPU
tensorview_custom_mem
#integer is not supported by cuDNN on backward pooling
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_avgpool_n1_c1_hw2x2
backwards_avgpool_n1_c1_hw4x4
backwards_avgpool_n2_c2_hw4x4
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment