Commit 40ff77bd authored by Chris Sullivan, committed by Robert Kimball

Update pad on nvgpu (#1759)

* Add pad-with-fill operator using the outward-in index pattern (see the sketch below).

* Remove the static pad and rename build_pad_dynamic -> build_pad. Update 1d maxpool padding.

* Formatting.

* Split build_pad_dynamic into build_pad and build_pad_fill.

* Add test coverage for a fixed bug in op::Pad on the GPU backend.
parent 519b18ac
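
Editor's note: the "outward-in index pattern" named in the first bullet means each CUDA thread owns one output element and decodes its flat output index dimension by dimension, outermost first, mapping it back to an input element, or to the fill value when it lands in exterior padding or an interior (dilation) gap. A minimal host-side sketch of that mapping follows; the helper name and parameter shapes are illustrative, not taken from this commit:

    #include <cstdint>
    #include <vector>

    // Map one flat output index back to a flat input index, walking the
    // dimensions from outermost to innermost. Returns false when the output
    // element lies in exterior padding or an interior (dilation) gap, in
    // which case it should take the fill value instead.
    bool outward_in_map(uint32_t out_idx,
                        const std::vector<uint32_t>& out_strides,
                        const std::vector<uint32_t>& in_strides,
                        const std::vector<uint32_t>& in_shape,
                        const std::vector<int32_t>& pad_below,
                        const std::vector<int32_t>& pad_interior,
                        uint32_t& in_idx)
    {
        uint32_t rem = out_idx; // portion of the output index not yet decoded
        in_idx = 0;
        for (size_t i = 0; i < in_shape.size(); i++)
        {
            int32_t d = static_cast<int32_t>(rem / out_strides[i]) - pad_below[i];
            rem %= out_strides[i];
            int32_t step = pad_interior[i] + 1; // dilated stride along dim i
            if (d < 0 || d % step != 0 || d / step >= static_cast<int32_t>(in_shape[i]))
            {
                return false; // exterior padding or dilation gap
            }
            in_idx += static_cast<uint32_t>(d / step) * in_strides[i];
        }
        return true;
    }
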
@@ -50,19 +50,17 @@ namespace ngraph
     size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
 public:
-    size_t build_pad(const std::array<std::string, 2>& dtypes,
+    size_t build_pad(const std::vector<std::string>& dtypes,
                      NVShape input_shape,
                      NVShape output_shape,
-                     NVShape pad_below,
-                     NVShape pad_above,
-                     NVShape pad_interior,
-                     const std::string& pad_value = "");
-    size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
-                             NVShape input_shape,
-                             NVShape output_shape,
-                             NVShape padding_below,
-                             NVShape padding_interior);
+                     NVShape padding_below,
+                     NVShape padding_interior);
+    size_t build_pad_fill(const std::vector<std::string>& dtypes,
+                          NVShape input_shape,
+                          NVShape output_shape,
+                          NVShape padding_below,
+                          NVShape padding_interior);
     size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
                              NVShape input_shape,
@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     Shape padding_interior(data_dilation_strides);
     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
             idx_workspace = allocator.reserve_workspace(temp_size, true);
             auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-            pad_dynamic_index =
-                cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
+            pad_index = cuda_emitter->build_pad({{args[0].get_element_type().c_type_string(),
                                                   out[0].get_element_type().c_type_string()}},
                                                 input_shape,
                                                 input_shape_padded,
@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             if (idx_workspace != std::numeric_limits<size_t>::max() &&
-                pad_dynamic_index != std::numeric_limits<size_t>::max())
+                pad_index != std::numeric_limits<size_t>::max())
             {
                 void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                 gpu::invoke_primitive(m_ctx,
-                                      pad_dynamic_index,
+                                      pad_index,
                                       std::vector<void*>{inputs[0]}.data(),
                                       std::vector<void*>{pad_buffer}.data());
                 gpu::invoke_primitive(
@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     Shape padding_interior(data_dilation_strides);
     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     size_t slice_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
@@ -556,11 +555,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
             idx_workspace = allocator.reserve_workspace(temp_size, true);
             auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-            pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                                output_shape,
-                                                                output_shape_padded,
-                                                                padding_below,
-                                                                padding_interior);
+            pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                                output_shape,
+                                                output_shape_padded,
+                                                padding_below,
+                                                padding_interior);
             slice_index = cuda_emitter->build_slice({{input_type, output_type}},
                                                     output_shape_padded,
@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
                                                                          void** outputs) mutable {
         if (idx_workspace != std::numeric_limits<size_t>::max() &&
-            pad_dynamic_index != std::numeric_limits<size_t>::max() &&
+            pad_index != std::numeric_limits<size_t>::max() &&
             slice_index != std::numeric_limits<size_t>::max())
         {
             void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
             gpu::invoke_primitive(m_ctx,
-                                  pad_dynamic_index,
+                                  pad_index,
                                   std::vector<void*>{inputs[0]}.data(),
                                   std::vector<void*>{pad_buffer}.data());
             gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     Shape padding_interior(data_dilation_strides);
     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
@@ -675,11 +674,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
             idx_workspace = allocator.reserve_workspace(temp_size, true);
             auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-            pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                                input_shape_0,
-                                                                input_shape_padded,
-                                                                padding_below,
-                                                                padding_interior);
+            pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                                input_shape_0,
+                                                input_shape_padded,
+                                                padding_below,
+                                                padding_interior);
             // asymmetric padding has been applied, zero out padding vectors to
             // ensure cudnn does not assume padding
@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             if (idx_workspace != std::numeric_limits<size_t>::max() &&
-                pad_dynamic_index != std::numeric_limits<size_t>::max())
+                pad_index != std::numeric_limits<size_t>::max())
             {
                 void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                 gpu::invoke_primitive(m_ctx,
-                                      pad_dynamic_index,
+                                      pad_index,
                                       std::vector<void*>{inputs[0]}.data(),
                                       std::vector<void*>{pad_buffer}.data());
                 gpu::invoke_primitive(
@@ -768,11 +767,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
                                               padded_size * args[0].get_element_type().size());
         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                    input_shape,
-                                                    input_shape_padded,
-                                                    padding_below,
-                                                    padding_interior);
+        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                            input_shape,
+                                            input_shape_padded,
+                                            padding_below,
+                                            padding_interior);
         // asymmetric padding has been applied, zero out padding vectors to
         // ensure cuDNN does not assume padding during pooling
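
Editor's note: the "zero out padding vectors" comment above refers to the pre-padding trick used throughout these hunks; once the explicit pad kernel has materialized an asymmetrically padded input on the device, the padding handed to cuDNN must be cleared so the window is not padded a second time. Schematically (a hypothetical helper for illustration; the literal code is in the collapsed context):

    #include <algorithm>
    #include <vector>

    // Once an asymmetrically padded input has been materialized by the
    // explicit pad kernel, zero the padding vectors so cuDNN does not
    // apply padding again on top of it.
    void clear_padding(std::vector<size_t>& padding_below,
                       std::vector<size_t>& padding_above)
    {
        std::fill(padding_below.begin(), padding_below.end(), 0);
        std::fill(padding_above.begin(), padding_above.end(), 0);
    }
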
@@ -638,12 +638,10 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
     writer.block_end();
 }
-void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
-    codegen::CodeWriter& writer,
-    const std::string& name,
-    GPUKernelArgs& args,
-    const std::array<std::string, 2>& data_types,
-    size_t rank)
+void runtime::gpu::CudaKernelBuilder::get_pad_op(codegen::CodeWriter& writer,
+                                                 const std::string& name,
+                                                 GPUKernelArgs& args,
+                                                 size_t rank)
 {
     writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
     writer.block_begin();
@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
     writer.block_end();
 }
+void runtime::gpu::CudaKernelBuilder::get_pad_fill_op(codegen::CodeWriter& writer,
+                                                      const std::string& name,
+                                                      GPUKernelArgs& args,
+                                                      size_t rank)
+{
+    writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
+    writer.block_begin();
+    {
+        writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
+        writer << "if (tid < n)\n";
+        writer.block_begin();
+        {
+            writer << "bool in_bounds = true;\n";
+            writer << "uint32_t output_pixel = tid;\n";
+            writer << "uint32_t input_pixel = 0;\n";
+            writer << "int32_t input, input_dil;\n";
+            for (size_t i = 0; i < rank; i++)
+            {
+                if (i != 0)
+                {
+                    writer << "output_pixel %= output_strides" << i - 1 << ";\n";
+                }
+                writer << "input_dil = output_pixel / output_strides" << i << " - padding_below"
+                       << i << ";\n";
+                writer << "input = input_dil / (padding_interior" << i << " + 1);\n";
+                writer << "input_dil %= (padding_interior" << i << " + 1);\n";
+                writer << "in_bounds = in_bounds && (input >= 0) && (input < input_shape" << i
+                       << ") && (input_dil == 0);\n";
+                writer << "input_pixel += input * input_strides" << i << ";\n";
+            }
+            writer << "out[tid] = (in_bounds) ? in[input_pixel] : *pad;\n";
+        }
+        writer.block_end();
+    }
+    writer.block_end();
+}
 void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
     codegen::CodeWriter& writer,
     const std::string& name,
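
Editor's note: for orientation, here is roughly the kernel that get_pad_fill_op emits for rank = 2 and float data. The parameter list is produced by GPUKernelArgs::get_input_signature(), so the exact names, types, and ordering below are an assumption; the body mirrors the writer calls above line for line.

    // Sketch of the generated pad-fill kernel for a rank-2 float tensor.
    extern "C" __global__ void cuda_pad_fill(float* in,
                                             float* pad, // scalar fill value
                                             float* out,
                                             uint32_t input_strides0, uint32_t input_strides1,
                                             uint32_t output_strides0, uint32_t output_strides1,
                                             uint32_t padding_below0, uint32_t padding_below1,
                                             uint32_t padding_interior0, uint32_t padding_interior1,
                                             uint32_t input_shape0, uint32_t input_shape1,
                                             uint32_t n)
    {
        uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            bool in_bounds = true;
            uint32_t output_pixel = tid;
            uint32_t input_pixel = 0;
            int32_t input, input_dil;
            // dimension 0 (outermost)
            input_dil = output_pixel / output_strides0 - padding_below0;
            input = input_dil / (padding_interior0 + 1);
            input_dil %= (padding_interior0 + 1);
            in_bounds = in_bounds && (input >= 0) && (input < input_shape0) && (input_dil == 0);
            input_pixel += input * input_strides0;
            // dimension 1 (innermost)
            output_pixel %= output_strides0;
            input_dil = output_pixel / output_strides1 - padding_below1;
            input = input_dil / (padding_interior1 + 1);
            input_dil %= (padding_interior1 + 1);
            in_bounds = in_bounds && (input >= 0) && (input < input_shape1) && (input_dil == 0);
            input_pixel += input * input_strides1;
            // in-bounds threads copy their input element; the rest take the fill value
            out[tid] = (in_bounds) ? in[input_pixel] : *pad;
        }
    }
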
@@ -130,11 +130,15 @@ namespace ngraph
                 const std::string& math_kernel,
                 const std::vector<std::string>& data_types);
-            static void get_pad_dynamic_op(codegen::CodeWriter& writer,
-                                           const std::string& name,
-                                           GPUKernelArgs& args,
-                                           const std::array<std::string, 2>& data_types,
-                                           size_t rank);
+            static void get_pad_op(codegen::CodeWriter& writer,
+                                   const std::string& name,
+                                   GPUKernelArgs& args,
+                                   size_t rank);
+            static void get_pad_fill_op(codegen::CodeWriter& writer,
+                                        const std::string& name,
+                                        GPUKernelArgs& args,
+                                        size_t rank);
             static void get_ew_collective_op(codegen::CodeWriter& writer,
                                              const std::string& name,
@@ -827,12 +827,12 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS)
     auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
-    auto pad_index = cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
-                                             input_shape,
-                                             output_shape,
-                                             padding_below,
-                                             padding_above,
-                                             padding_interior);
+    auto pad_index = cuda_emitter->build_pad_fill(
+        {{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
+        input_shape,
+        output_shape,
+        padding_below,
+        padding_interior);
     writer << "void* input[] = {" << node_names(args) << "};\n";
     writer << "void* output[] = {" << node_names(out) << "};\n";
     writer << "gpu::invoke_primitive(ctx, " << pad_index << ", input, output);\n";
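
Editor's note: unlike the old build_pad, build_pad_fill is not passed padding_above. The kernel never needs it; the upper padding is implied by output_shape, and any output coordinate beyond the last dilated input element simply fails the in-bounds test and receives the fill value. The shape invariant being relied on, as a sketch (check_pad_shapes is a hypothetical helper, not part of this commit):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // For every dimension: out = below + dilated-input extent + above,
    // where the dilated extent inserts `interior` elements between each
    // pair of neighbouring input elements.
    void check_pad_shapes(const std::vector<size_t>& in,
                          const std::vector<size_t>& out,
                          const std::vector<size_t>& below,
                          const std::vector<size_t>& above,
                          const std::vector<size_t>& interior)
    {
        for (size_t i = 0; i < in.size(); i++)
        {
            size_t dilated = (in[i] == 0) ? 0 : (in[i] - 1) * (interior[i] + 1) + 1;
            assert(out[i] == below[i] + dilated + above[i]);
        }
    }
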
@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2)
     EXPECT_EQ(expected, read_vector<float>(result));
 }
+// This test covers the case with multiple images and asymmetric padding;
+// it guards against a bug that was found in the nvGPU backend.
+NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym)
+{
+    Shape shape_a{2, 2, 4, 4};
+    auto window_movement_strides = Strides{2, 2};
+    Shape padding_below{0, 0, 0, 0};
+    Shape padding_above{0, 0, 2, 2};
+    Shape padding_interior{0, 0, 0, 0};
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    Shape shape_b{};
+    auto B = make_shared<op::Parameter>(element::f32, shape_b);
+    Shape shape_r{2, 2, 6, 6};
+    auto f = make_shared<Function>(
+        make_shared<op::Pad>(A, B, padding_below, padding_above, padding_interior),
+        op::ParameterVector{A, B});
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape_a);
+    copy_data(a,
+              test::NDArray<float, 4>({{{{0, 1, 0, 2}, // img 0 chan 0
+                                         {0, 3, 2, 0},
+                                         {2, 0, 0, 0},
+                                         {0, 2, 1, 0}},
+                                        {{0, 0, 0, 2}, // img 0 chan 1
+                                         {0, 2, 3, 0},
+                                         {2, 0, 1, 0},
+                                         {2, 0, 0, 0}}},
+                                       {{{0, 2, 1, 1}, // img 1 chan 0
+                                         {0, 0, 2, 0},
+                                         {0, 0, 1, 2},
+                                         {0, 0, 0, 0}},
+                                        {{2, 1, 0, 0}, // img 1 chan 1
+                                         {0, 2, 0, 0},
+                                         {1, 1, 2, 0},
+                                         {1, 0, 0, 0}}}})
+                  .get_vector());
+    auto b = backend->create_tensor(element::f32, shape_b);
+    copy_data(b, vector<float>{42});
+    auto result = backend->create_tensor(element::f32, shape_r);
+    backend->call_with_validate(f, {result}, {a, b});
+    EXPECT_EQ((test::NDArray<float, 4>({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0
+                                          {0, 3, 2, 0, 42, 42},
+                                          {2, 0, 0, 0, 42, 42},
+                                          {0, 2, 1, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}},
+                                         {{0, 0, 0, 2, 42, 42}, // img 0 chan 1
+                                          {0, 2, 3, 0, 42, 42},
+                                          {2, 0, 1, 0, 42, 42},
+                                          {2, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}}},
+                                        {{{0, 2, 1, 1, 42, 42}, // img 1 chan 0
+                                          {0, 0, 2, 0, 42, 42},
+                                          {0, 0, 1, 2, 42, 42},
+                                          {0, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}},
+                                         {{2, 1, 0, 0, 42, 42}, // img 1 chan 1
+                                          {0, 2, 0, 0, 42, 42},
+                                          {1, 1, 2, 0, 42, 42},
+                                          {1, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}}}})
+                  .get_vector()),
+              read_vector<float>(result));
+}
 // Trivial case with no reduced axes.
 NGRAPH_TEST(${BACKEND_NAME}, product_trivial)
 {