Commit ae45c984 authored by Chris Sullivan, committed by Scott Cyphers

Nd convolution via blocked GEMM for C{d1,...,dn}N layout (#1131)

* Added blank convolution kernel and refactored coordinate transform kernel helper.

* Added op::Reshape to the CUDAEmitter.

* Added tiled convolution for 2-d through N-d tensors.

* Fixed bugs in data_dilation and the filter loop. A test covering register tiling still needs to be added.

* Styling.

* Removed some comments and code added for testing.

* Some tests became enabled in the merge; removed their entries from the test manifest.
parent 3a43bdac
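The core idea of this change, sketched below for the 1-d case as plain host-side C++ (an illustrative reference, not the emitted CUDA kernel): with the C{d1,...,dn}N layout the convolution becomes a blocked GEMM in which the filter acts as a [K x C*R] matrix and an im2col-style view of the input supplies the [C*R x P*N] operand, with the batch dimension N innermost so loads along it coalesce on the device.

// Reference 1-d convolution over C{d}N-ordered tensors, written as the
// GEMM it lowers to. All names and shapes here are illustrative.
#include <cstddef>
#include <vector>

void conv1d_as_gemm(const std::vector<float>& input,  // [C, D, N]
                    const std::vector<float>& filter, // [K, C, R]
                    std::vector<float>& output,       // [K, P, N], P = D - R + 1
                    size_t C, size_t D, size_t N, size_t K, size_t R)
{
    const size_t P = D - R + 1; // no padding/stride/dilation in this sketch
    for (size_t k = 0; k < K; k++)         // GEMM row (one filter)
        for (size_t p = 0; p < P; p++)     // GEMM column block (output position)
            for (size_t n = 0; n < N; n++) // innermost batch: contiguous in C{d}N
            {
                float acc = 0.f;
                for (size_t c = 0; c < C; c++)
                    for (size_t r = 0; r < R; r++)
                        acc += filter[(k * C + c) * R + r] * input[(c * D + p + r) * N + n];
                output[(k * P + p) * N + n] = acc;
            }
}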
@@ -127,6 +127,21 @@ namespace ngraph
GPUShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order);
size_t build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
GPUShape filter_shape,
GPUShape filter_stride,
GPUShape filter_dilation,
GPUShape output_shape);
private:
CUDAEmitter(GPUPrimitiveEmitter* emitter);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
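For context, here is a hypothetical registration of a 2-d forward convolution through the new build_convolution entry point; cuda_emitter, ctx, and every shape value below are illustrative, not taken from the commit.

// Sketch: registering a convolution primitive and getting back its index.
size_t conv_index = cuda_emitter->build_convolution(
    ctx,
    {{"float", "float", "float"}}, // input, filter, output dtypes
    GPUShape{3, 224, 224, 8},      // input shape (C, d1, d2, N -- illustrative order)
    GPUShape{1, 1},                // input padding below
    GPUShape{1, 1},                // input (data) dilation
    GPUShape{64, 3, 3, 3},         // filter shape
    GPUShape{1, 1},                // filter stride
    GPUShape{1, 1},                // filter dilation
    GPUShape{64, 224, 224, 8});    // output shape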
@@ -114,6 +114,16 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
bool include_pad);
static void get_convolution_forward(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
int N,
int K,
int filter_size,
int rank,
int sm_tile_size = 8,
int reg_tile_size = 1);
static void add_pod_typedefs(codegen::CodeWriter& writer);
/// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
@@ -127,6 +137,13 @@
std::string i_reduced_strides,
std::string o_coordinates,
size_t rank);
static void coordinate_transform_to_multi_d(codegen::CodeWriter& writer,
std::string i_strides,
std::string i_stride_magic,
std::string i_stride_shift,
std::string i_coord_product,
std::string o_coordinates,
size_t rank);
};
}
}
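The new coordinate_transform_to_multi_d helper emits device code that recovers multi-dimensional coordinates from a flat index. A host-side sketch of the arithmetic it generates (illustrative only; in the emitted CUDA the divisions are replaced by multiply-and-shift sequences built from the idiv_magic_u32 magic/shift pairs, so no hardware division is issued):

#include <cstdint>
#include <vector>

std::vector<uint32_t> to_multi_d(uint32_t flat, const std::vector<uint32_t>& strides)
{
    std::vector<uint32_t> coord(strides.size());
    for (size_t i = 0; i < strides.size(); i++) // strides in row-major (descending) order
    {
        coord[i] = flat / strides[i];  // magic-number division on the device
        flat -= coord[i] * strides[i]; // remainder carries to the next dimension
    }
    return coord;
}

For example, with shape {2, 3, 4} the strides are {12, 4, 1}, so flat index 17 maps to coordinate {1, 1, 1}.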
@@ -42,6 +42,12 @@ namespace ngraph
GPUAllocator(const GPUAllocator& g);
~GPUAllocator();
template <typename T>
size_t reserve_argspace(const T& container)
{
return reserve_argspace(container.data(),
container.size() * sizeof(typename T::value_type));
}
size_t reserve_argspace(const void* data, size_t size);
size_t reserve_workspace(size_t size, bool zero_initialize = true);
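The container overload above is a convenience wrapper over the pointer-and-byte-count form. A hypothetical call site (allocator and values illustrative):

std::vector<int32_t> strides{12, 4, 1};
size_t arg_offset = allocator.reserve_argspace(strides);
// equivalent to:
// allocator.reserve_argspace(strides.data(), strides.size() * sizeof(int32_t));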
@@ -21,6 +21,7 @@
#include <vector>
#include "ngraph/axis_set.hpp"
#include "ngraph/axis_vector.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp"
@@ -30,45 +31,45 @@ namespace ngraph
{
class Shape;
/// \brief Shape for a tensor resident on GPU.
class GPUShape : public std::vector<uint32_t>
class GPUShape : public std::vector<int32_t>
{
public:
GPUShape(const std::initializer_list<uint32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
GPUShape(const std::initializer_list<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const std::vector<uint32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
GPUShape(const std::vector<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const GPUShape& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
explicit GPUShape(size_t n, uint32_t initial_value = 0)
: std::vector<uint32_t>(n, initial_value)
explicit GPUShape(size_t n, int32_t initial_value = 0)
: std::vector<int32_t>(n, initial_value)
{
}
template <class InputIterator>
GPUShape(InputIterator first, InputIterator last)
: std::vector<uint32_t>(first, last)
: std::vector<int32_t>(first, last)
{
}
GPUShape() {}
GPUShape& operator=(const GPUShape& v)
{
static_cast<std::vector<uint32_t>*>(this)->operator=(v);
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
GPUShape& operator=(GPUShape&& v)
{
static_cast<std::vector<uint32_t>*>(this)->operator=(v);
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
@@ -81,7 +82,7 @@
throw std::runtime_error(
"Request exceeds the bitwidth available for GPUShapes (32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
@@ -95,7 +96,7 @@
"Request for Shape which exceeds the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
@@ -109,7 +110,7 @@
"Request for Strides which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
@@ -123,21 +124,36 @@
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
GPUShape(const CoordinateDiff& coord)
{
for (auto const& size : coord)
for (auto const& dim : coord)
{
if (dim != static_cast<int32_t>(dim)) // reject values outside the int32 range, including negatives
{
throw std::runtime_error(
"Request for CoordinateDiff which exceed the bitwidth available for "
"GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(dim));
}
}
GPUShape(const AxisVector& vec)
{
for (auto const& size : vec)
{
if (size >> 31 != 0) // anything above INT32_MAX cannot be stored as int32_t
{
throw std::runtime_error(
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"Request for axis vector which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
};
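The switch from uint32_t to int32_t elements is what makes the new CoordinateDiff constructor viable: padding offsets can be negative, and an unsigned element type would silently wrap them. A small illustration (values hypothetical):

CoordinateDiff pad_below{-1, 0};
GPUShape pad(pad_below); // holds {-1, 0}; as uint32_t, -1 would wrap to 4294967295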
@@ -186,3 +186,9 @@ std::pair<uint64_t, uint64_t> runtime::gpu::idiv_magic_u64(uint64_t divisor)
{
return magicU64(divisor);
}
uint32_t runtime::gpu::idiv_ceil(int n, int d)
{
// the compiler fuses the division and modulo into a single operation
return n / d + (n % d > 0);
}
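A typical use of idiv_ceil is launch-dimension math, for example counting how many shared-memory tiles of get_convolution_forward's default width cover an output extent (values illustrative):

uint32_t blocks = runtime::gpu::idiv_ceil(222, 8); // 222 / 8 = 27.75 -> 28 tiles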
@@ -103,6 +103,7 @@ namespace ngraph
void cuda_memset(void* dst, int value, size_t buffer_size);
std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor);
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
uint32_t idiv_ceil(int n, int d);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
@@ -6,17 +6,6 @@ batch_norm_three_outputs
computation_reuse
#int64 is not supported
concat_matrix_int64
#convolution 4d is work in progress
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
convolution_4d_4items_padded_neg
convolution_4d_4items_strided
convolution_4d_4items_strided_dilated
convolution_4d_4items_strided_dilated_padded
convolution_4d_4items_strided_dilated_padded_neg
convolution_4d_4items_strided_dilated_padded_same
#cuDNN does not have arithmetic exceptions
divide_by_zero_int32
#int64 is not supported by cuDNN
dot_matrix_vector_int64