Commit ae45c984 authored by Chris Sullivan, committed by Scott Cyphers

Nd convolution via blocked GEMM for C{d1,...,dn}N layout (#1131)

* Added blank convolution kernel and refactored coordinate transform kernel helper.

* Added op::Reshape to the CUDAEmitter.

* Added 2-Nd tiled convolution.

* Bug fixes for data_dilation and the filter loop. Still need to add a test for coverage of register tiling.

* Styling.

* Removed some comments and code added for testing.

* Some tests became enabled in the merge; removing them.
parent 3a43bdac
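For orientation, the layout change named in the commit title is a cyclic permutation of axes: the leading batch axis N (or K for the filter) is rotated to the back, so NC{d1,...,dn} becomes C{d1,...,dn}N and KC{df1,...,dfn} becomes C{df1,...,dfn}K. A minimal standalone sketch of that reordering, mirroring the input_order loops and the reshape lambda in the diff below (helper names are invented for illustration, not part of this commit):

#include <cstddef>
#include <vector>

// Cyclic order for NC{d1,...,dn} -> C{d1,...,dn}N: for rank r it is {1, 2, ..., r-1, 0}.
std::vector<int> cdn_order(size_t rank)
{
    std::vector<int> order;
    for (size_t i = 1; i <= rank; i++)
    {
        order.push_back(static_cast<int>(i % rank));
    }
    return order;
}

// Inverse order used to move N back to the front afterwards: {r-1, 0, 1, ..., r-2}.
std::vector<int> ncd_order(size_t rank)
{
    std::vector<int> order{static_cast<int>(rank - 1)};
    for (size_t i = 0; i + 1 < rank; i++)
    {
        order.push_back(static_cast<int>(i));
    }
    return order;
}

// Apply an axis order to a shape: {N, C, H, W} with order {1, 2, 3, 0} gives {C, H, W, N}.
std::vector<size_t> permute(const std::vector<size_t>& shape, const std::vector<int>& order)
{
    std::vector<size_t> result(shape.size());
    for (size_t i = 0; i < shape.size(); i++)
    {
        result[i] = shape[order[i]];
    }
    return result;
}

The same forward order is applied to both data and filter, since in both cases the leading axis (N or K) moves to the end.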
......@@ -127,6 +127,21 @@ namespace ngraph
GPUShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order);
size_t build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
GPUShape filter_shape,
GPUShape filter_stride,
GPUShape filter_dilation,
GPUShape output_shape);
private:
CUDAEmitter(GPUPrimitiveEmitter* emitter);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
......
......@@ -114,6 +114,16 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
bool include_pad);
static void get_convolution_forward(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
int N,
int K,
int filter_size,
int rank,
int sm_tile_size = 8,
int reg_tile_size = 1);
static void add_pod_typedefs(codegen::CodeWriter& writer);
/// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
......@@ -127,6 +137,13 @@ namespace ngraph
std::string i_reduced_strides,
std::string o_coordinates,
size_t rank);
static void coordinate_transform_to_multi_d(codegen::CodeWriter& writer,
std::string i_strides,
std::string i_stride_magic,
std::string i_stride_shift,
std::string i_coord_product,
std::string o_coordinates,
size_t rank);
};
}
}
......
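coordinate_transform_to_multi_d appears to emit device code that expands a flat index into per-axis coordinates; the i_stride_magic and i_stride_shift parameters suggest the per-axis divisions are done with precomputed magic multipliers (cf. idiv_magic_u32 in gpu_util) rather than hardware division. A host-side equivalent using plain division would look roughly like this (illustrative sketch only, not the emitted CUDA):

#include <cstdint>
#include <vector>

// Expand a flat row-major index into multi-dimensional coordinates.
// The generated kernel performs the same arithmetic, but replaces each
// division by a multiply-high-and-shift using the precomputed magic numbers.
std::vector<uint32_t> to_multi_d(uint32_t flat_index, const std::vector<uint32_t>& strides)
{
    std::vector<uint32_t> coordinates(strides.size());
    for (size_t i = 0; i < strides.size(); i++)
    {
        coordinates[i] = flat_index / strides[i];
        flat_index -= coordinates[i] * strides[i];
    }
    return coordinates;
}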
......@@ -149,6 +149,11 @@ namespace ngraph
}
auto convolution = static_cast<const ngraph::op::Convolution*>(node);
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
auto rank = input_shape.size();
Strides window_dilation_strides = convolution->get_window_dilation_strides();
Strides window_movement_strides = convolution->get_window_movement_strides();
Strides data_dilation_strides = convolution->get_data_dilation_strides();
......@@ -157,10 +162,105 @@ namespace ngraph
if (padding_below_diff.size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
// Reshape from NC{d1,...,dn} -> C{d1,...,dn}N
// and from KC{df1,...,dfn} -> C{df1,...,dfn}K.
// TODO: This should be done via a pass similar to
// what is done for convolution in the IA transformer
// c.f runtime/cpu/pass/cpu_layout.cpp
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t transposed_data_idx = allocator.reserve_workspace(
args[0].get_size() * args[0].get_element_type().size());
size_t transposed_filter_idx = allocator.reserve_workspace(
args[1].get_size() * args[1].get_element_type().size());
size_t transposed_output_idx = allocator.reserve_workspace(
out[0].get_size() * out[0].get_element_type().size());
GPUShape input_order;
for (int i = 1; i <= rank; i++)
{
input_order.push_back(i % rank);
}
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
size_t reshape_data_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[0].get_type(), args[0].get_type()}},
input_shape,
input_order);
writer << "void* data = gpu::invoke_memory_primitive(ctx, "
<< transposed_data_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_data_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{data}.data());\n";
size_t reshape_filter_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[1].get_type(), args[1].get_type()}},
filter_shape,
input_order);
writer << "void* filter = gpu::invoke_memory_primitive(ctx, "
<< transposed_filter_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_filter_index << ", ";
writer << "std::vector<void*>{" << args[1].get_name() << "}.data(), ";
writer << "std::vector<void*>{filter}.data());\n";
// local helper to reshape tensor shape objects
auto reshape = [](const Shape& shape, const GPUShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
output[i] = shape[order[i]];
}
return output;
};
// reorder axes of the input shape (NC{d_1,...,d_n} -> C{d_1,...,d_n}N)
input_shape = reshape(input_shape, input_order);
// reorder axes of the filter shape (KC{df_1,...,df_n} -> C{df_1,...,df_n}K)
filter_shape = reshape(filter_shape, input_order);
// reorder axes of the output shape (NK{do_1,...,do_n} -> K{do_1,...,do_n}N)
output_shape = reshape(output_shape, input_order);
size_t conv_index = cuda_emitter->build_convolution(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
padding_below_diff,
data_dilation_strides,
filter_shape,
window_movement_strides,
window_dilation_strides,
output_shape);
writer << "void* output = gpu::invoke_memory_primitive(ctx, "
<< transposed_output_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{data, filter}.data(), ";
writer << "std::vector<void*>{output}.data());\n";
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
input_order.push_back(static_cast<int>(rank - 1));
for (int i = 0; i < rank - 1; i++)
{
input_order.push_back(i);
}
size_t reshape_output_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[1].get_type(), args[1].get_type()}},
output_shape,
input_order);
writer << "gpu::invoke_primitive(ctx, " << reshape_output_index << ", ";
writer << "std::vector<void*>{output}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
else
{
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
......@@ -181,7 +281,6 @@ namespace ngraph
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto input_shape = args[0].get_shape();
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin();
......@@ -199,12 +298,13 @@ namespace ngraph
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(input_shape_padded) << ", 0);\n";
writer << "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
writer
<< "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic(external_function->ctx().get(),
auto pad_dynamic_index = cuda_emitter->build_pad_dynamic(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
......@@ -216,7 +316,7 @@ namespace ngraph
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
......@@ -246,6 +346,7 @@ namespace ngraph
writer << ");\n";
writer.block_end();
}
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBackpropData)
......
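As a concrete trace of the reorderings emitted above, consider a hypothetical rank-4 case (sizes invented; unit stride, no padding, no dilation assumed):

// input_order = {1, 2, 3, 0}
//   data    NCHW {32, 3, 224, 224}  ->  CHWN {3, 224, 224, 32}
//   filter  KCRS {64, 3, 7, 7}      ->  CRSK {3, 7, 7, 64}
// the convolution writes its result in K{do1,do2}N layout: {64, 218, 218, 32}
// restore order = {3, 0, 1, 2}
//   output  K{do1,do2}N {64, 218, 218, 32}  ->  NK{do1,do2} {32, 64, 218, 218}, the layout out[0] expects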
......@@ -42,6 +42,12 @@ namespace ngraph
GPUAllocator(const GPUAllocator& g);
~GPUAllocator();
template <typename T>
size_t reserve_argspace(const T& container)
{
return reserve_argspace(container.data(),
container.size() * sizeof(typename T::value_type));
}
size_t reserve_argspace(const void* data, size_t size);
size_t reserve_workspace(size_t size, bool zero_initialize = true);
......
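The new reserve_argspace overload is a convenience wrapper that derives the byte count from a container; a hypothetical call site (variable names invented for illustration):

GPUAllocator allocator =
    external_function->get_primitive_emitter()->get_memory_allocator();
std::vector<float> host_values{1.0f, 2.0f, 3.0f};
// forwards to reserve_argspace(host_values.data(), host_values.size() * sizeof(float))
size_t values_idx = allocator.reserve_argspace(host_values);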
......@@ -21,6 +21,7 @@
#include <vector>
#include "ngraph/axis_set.hpp"
#include "ngraph/axis_vector.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp"
......@@ -30,45 +31,45 @@ namespace ngraph
{
class Shape;
/// \brief Shape for a tensor resident on GPU.
class GPUShape : public std::vector<uint32_t>
class GPUShape : public std::vector<int32_t>
{
public:
GPUShape(const std::initializer_list<uint32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
GPUShape(const std::initializer_list<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const std::vector<uint32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
GPUShape(const std::vector<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const GPUShape& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
explicit GPUShape(size_t n, uint32_t initial_value = 0)
: std::vector<uint32_t>(n, initial_value)
explicit GPUShape(size_t n, int32_t initial_value = 0)
: std::vector<int32_t>(n, initial_value)
{
}
template <class InputIterator>
GPUShape(InputIterator first, InputIterator last)
: std::vector<uint32_t>(first, last)
: std::vector<int32_t>(first, last)
{
}
GPUShape() {}
GPUShape& operator=(const GPUShape& v)
{
static_cast<std::vector<uint32_t>*>(this)->operator=(v);
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
GPUShape& operator=(GPUShape&& v)
{
static_cast<std::vector<uint32_t>*>(this)->operator=(v);
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
......@@ -81,7 +82,7 @@ namespace ngraph
throw std::runtime_error(
"Request exceeds the bitwidth available for GPUShapes (32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
......@@ -95,7 +96,7 @@ namespace ngraph
"Request for Shape which exceeds the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
......@@ -109,7 +110,7 @@ namespace ngraph
"Request for Strides which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
......@@ -123,21 +124,36 @@ namespace ngraph
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
GPUShape(const CoordinateDiff& coord)
{
for (auto const& size : coord)
for (auto const& dim : coord)
{
if (dim > 0 && dim >> 32 != 0)
{
throw std::runtime_error(
"Request for CoordinateDiff which exceed the bitwidth available for "
"GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(dim));
}
}
GPUShape(const AxisVector& vec)
{
for (auto const& size : vec)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"Request for axis vector which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
};
......
......@@ -186,3 +186,9 @@ std::pair<uint64_t, uint64_t> runtime::gpu::idiv_magic_u64(uint64_t divisor)
{
return magicU64(divisor);
}
uint32_t runtime::gpu::idiv_ceil(int n, int d)
{
// the compiler fuses the division and modulo into a single divide
return n / d + (n % d > 0);
}
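idiv_ceil rounds the integer quotient up. Its behavior on a few sample inputs, plus a typical (assumed) use for sizing a launch grid:

#include <cassert>
// idiv_ceil(n, d) == ceil(n / d) for positive n and d
assert(runtime::gpu::idiv_ceil(16, 8) == 2);
assert(runtime::gpu::idiv_ceil(17, 8) == 3);
assert(runtime::gpu::idiv_ceil(7, 8) == 1);
// e.g. (hypothetical names) uint32_t blocks = idiv_ceil(element_count, threads_per_block);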
......@@ -103,6 +103,7 @@ namespace ngraph
void cuda_memset(void* dst, int value, size_t buffer_size);
std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor);
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
uint32_t idiv_ceil(int n, int d);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
......
......@@ -6,17 +6,6 @@ batch_norm_three_outputs
computation_reuse
#int64 is not supported
concat_matrix_int64
#convolution 4d is work in progress
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
convolution_4d_4items_padded_neg
convolution_4d_4items_strided
convolution_4d_4items_strided_dilated
convolution_4d_4items_strided_dilated_padded
convolution_4d_4items_strided_dilated_padded_neg
convolution_4d_4items_strided_dilated_padded_same
#cuDNN does not have arithmetic exceptions
divide_by_zero_int32
#int64 is not supported by cuDNN
dot_matrix_vector_int64
......