Commit 4e78f25d authored by Fenglei, committed by Robert Kimball

gpu reshape n-dimension (n>2) (#716)

* add n-dimensional reshape

* intermediate version: compiles and runs without crashing, but still produces wrong results

* rename output_strides to trans_strides, which maps an input index to an output index (see the sketch below)

* use std::vector instead of C arrays

* remove manual delete calls

* pass strings and arrays by const reference

* update 'unimplemented' comments, remove extra indentation

* format code and cast size_t to int
parent b3896731
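
To make the stride terminology concrete, here is a minimal host-side sketch (not part of the commit) of the index arithmetic that the generated reshape kernel and the emitter below implement: input_strides decompose a flat input index into per-axis coordinates, and trans_strides scatter those coordinates into the flat index of the permuted output. The 2x3 transpose example and all variable values are illustrative assumptions only.

// Host-side sketch of the reshape index math; the inner loop mirrors the body
// of the CUDA kernel emitted by get_reshape_op, with tid playing the role of
// the per-thread id.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<size_t> arg_shape{2, 3};   // input tensor shape (assumed example)
    std::vector<size_t> input_order{1, 0}; // axis permutation (here: a plain transpose)
    size_t rank = arg_shape.size();

    std::vector<size_t> input_strides(rank), output_strides(rank), trans_strides(rank);

    // Row-major strides of the input.
    size_t stride = 1;
    for (int i = static_cast<int>(rank) - 1; i >= 0; i--)
    {
        input_strides[i] = stride;
        stride *= arg_shape[i];
    }
    // Row-major strides of the output shape (input shape permuted by input_order).
    stride = 1;
    for (int i = static_cast<int>(rank) - 1; i >= 0; i--)
    {
        output_strides[i] = stride;
        stride *= arg_shape[input_order[i]];
    }
    // trans_strides[a] is the output stride of the position that input axis a
    // moves to, so a single pass over the input coordinates suffices.
    for (size_t i = 0; i < rank; i++)
    {
        trans_strides[input_order[i]] = output_strides[i];
    }

    std::vector<float> in{0, 1, 2, 3, 4, 5}; // 2x3, row-major
    std::vector<float> out(in.size());
    for (size_t tid = 0; tid < in.size(); tid++)
    {
        size_t idx_in = tid;
        size_t idx_out = 0;
        for (size_t i = 0; i < rank; i++)
        {
            idx_out += (idx_in / input_strides[i]) * trans_strides[i];
            idx_in %= input_strides[i];
        }
        out[idx_out] = in[tid];
    }

    // Expected 3x2 transpose of the 2x3 input.
    std::vector<float> expected{0, 3, 1, 4, 2, 5};
    assert(out == expected);
    std::cout << "transpose ok\n";
    return 0;
}
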
......@@ -29,4 +29,4 @@ namespace ngraph
size_t batch_axis,
Shape mask_shape);
}
}
\ No newline at end of file
}
......@@ -104,6 +104,36 @@ void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
writer << "}\n";
}
void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1]
<< "* out, size_t* input_strides, size_t* trans_strides, size_t rank, size_t n)\n";
writer.block_begin();
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "size_t idx_in = tid;\n";
writer << "size_t idx_out = 0;\n";
writer << "for(size_t i = 0; i < rank; i++)\n";
writer.block_begin();
{
writer << "idx_out += (idx_in / input_strides[i]) * trans_strides[i];\n";
writer << "idx_in %= input_strides[i];\n";
}
writer.block_end();
writer << "out[idx_out] = in[tid];\n";
}
writer.block_end();
}
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_device_helper(
codegen::CodeWriter& writer,
const std::string& name,
......
......@@ -47,6 +47,10 @@ namespace ngraph
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_reshape_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
......
......@@ -23,10 +23,10 @@
using namespace ngraph;
using namespace ngraph::runtime::gpu;
void runtime::gpu::emit_broadcast(std::string name,
void runtime::gpu::emit_broadcast(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
const std::array<std::string, 2>& data_types,
size_t repeat_size,
size_t repeat_times,
size_t count)
......@@ -58,10 +58,10 @@ void runtime::gpu::emit_broadcast(std::string name,
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void runtime::gpu::emit_onehot(std::string name,
void runtime::gpu::emit_onehot(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
const std::array<std::string, 2>& data_types,
size_t repeat_size,
size_t repeat_times,
size_t count)
......@@ -92,3 +92,39 @@ void runtime::gpu::emit_onehot(std::string name,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void runtime::gpu::emit_reshape(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
const std::array<std::string, 2>& data_types,
CUdeviceptr input_strides,
CUdeviceptr trans_strides,
size_t rank,
size_t count)
{
std::string name_signature = name + "_" + data_types[0] + "_" + data_types[1];
std::replace(name_signature.begin(), name_signature.end(), ' ', '_');
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name_signature) == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_reshape_op(writer, name_signature, data_types);
std::string kernel = writer.get_code();
CudaFunctionPool::instance().set(name_signature, kernel);
}
void* args_list[] = {&in, &out, &input_strides, &trans_strides, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name_signature).get(),
static_cast<unsigned int>(count),
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
......@@ -34,25 +34,34 @@ namespace ngraph
template <typename T>
struct CudaOpMap;
void emit_broadcast(std::string name,
void emit_broadcast(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
const std::array<std::string, 2>& data_types,
size_t repeat_size,
size_t repeat_times,
size_t count);
void emit_onehot(std::string name,
void emit_onehot(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
const std::array<std::string, 2>& data_types,
size_t repeat_size,
size_t repeat_times,
size_t count);
void emit_reshape(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
const std::array<std::string, 2>& data_types,
CUdeviceptr input_strides,
CUdeviceptr trans_strides,
size_t rank,
size_t count);
template <typename T, typename... Inputs>
void emit_elementwise_op(std::string name,
std::array<std::string, 2> data_types,
void emit_elementwise_op(const std::string& name,
const std::array<std::string, 2>& data_types,
size_t count,
CUdeviceptr out,
Inputs&&... inputs)
......
......@@ -553,7 +553,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
else if (arg_rank == 2)
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
......@@ -568,12 +567,63 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "," << result_shape[1] << ");\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
}
// Other cases (reordering of axes for tensors with rank>2) are not handled yet.
// Other cases (reordering of axes for tensors with rank>2).
else
{
throw runtime_error(
"Axis permutation in reshape is not implemented yet for tensors with "
"rank>2");
std::vector<size_t> input_strides(arg_rank);
std::vector<size_t> output_strides(arg_rank);
std::vector<size_t> trans_strides(arg_rank);
size_t stride = 1;
for (int i = static_cast<int>(arg_rank) - 1; i >= 0; i--)
{
input_strides[i] = stride;
stride *= arg_shape[i];
}
stride = 1;
for (int i = static_cast<int>(arg_rank) - 1; i >= 0; i--)
{
output_strides[i] = stride;
stride *= arg_shape[input_order[i]];
}
for (int i = 0; i < arg_rank; i++)
{
trans_strides[input_order[i]] = output_strides[i];
}
writer << "size_t rank = " << arg_rank << ";\n";
writer << "std::vector<size_t> input_strides_h = {" << input_strides[0] << "UL";
for (int i = 1; i < arg_rank; i++)
{
writer << ", " << input_strides[i] << "UL";
}
writer << "};\n";
writer << "std::vector<size_t> trans_strides_h = {" << trans_strides[0] << "UL";
for (int i = 1; i < arg_rank; i++)
{
writer << ", " << trans_strides[i] << "UL";
}
writer << "};\n";
writer << "void* input_strides_d = "
"runtime::gpu::create_gpu_buffer(sizeof(size_t) * rank);\n";
writer << "void* trans_strides_d = "
"runtime::gpu::create_gpu_buffer(sizeof(size_t) * rank);\n";
writer
<< "runtime::gpu::cuda_memcpyHtD(input_strides_d, input_strides_h.data(), "
"sizeof(size_t) * rank);\n";
writer
<< "runtime::gpu::cuda_memcpyHtD(trans_strides_d, trans_strides_h.data(), "
"sizeof(size_t) * rank);\n";
writer << "runtime::gpu::emit_reshape(\"" << node->description()
<< "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
<< out[0].get_name() << ")"
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type()
<< "\"}"
<< ", "
<< "CUdeviceptr(input_strides_d), CUdeviceptr(trans_strides_d)"
<< ", " << arg_rank << ", " << args[0].get_size() << ");\n";
writer << "runtime::gpu::free_gpu_buffer(input_strides_d);\n";
writer << "runtime::gpu::free_gpu_buffer(trans_strides_d);\n";
}
writer.block_end();
}
......
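
For orientation, here is a hand-worked sketch of roughly what the host code emitted by the rank>2 branch above could look like, assuming a hypothetical rank-3 reshape with arg_shape {2, 3, 4} and input_order {2, 0, 1}; the tensor names (arg0, out0), the "float" element types, and the "Reshape" description string are placeholders rather than values taken from the commit, and only the helper calls that appear in the diff are used.

// Hypothetical emitted host code for arg_shape {2, 3, 4}, input_order {2, 0, 1}:
size_t rank = 3;
std::vector<size_t> input_strides_h = {12UL, 4UL, 1UL}; // row-major strides of {2, 3, 4}
std::vector<size_t> trans_strides_h = {3UL, 1UL, 6UL};  // output strides scattered by input_order
void* input_strides_d = runtime::gpu::create_gpu_buffer(sizeof(size_t) * rank);
void* trans_strides_d = runtime::gpu::create_gpu_buffer(sizeof(size_t) * rank);
runtime::gpu::cuda_memcpyHtD(input_strides_d, input_strides_h.data(), sizeof(size_t) * rank);
runtime::gpu::cuda_memcpyHtD(trans_strides_d, trans_strides_h.data(), sizeof(size_t) * rank);
runtime::gpu::emit_reshape("Reshape", CUdeviceptr(arg0), CUdeviceptr(out0),
                           {"float", "float"},
                           CUdeviceptr(input_strides_d), CUdeviceptr(trans_strides_d),
                           3, 24);
runtime::gpu::free_gpu_buffer(input_strides_d);
runtime::gpu::free_gpu_buffer(trans_strides_d);
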
......@@ -2340,7 +2340,6 @@ TEST(${BACKEND_NAME}, reduce_3d_to_vector)
TEST(${BACKEND_NAME}, reshape_t2v_012)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{2, 2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{12};
......@@ -2363,7 +2362,6 @@ TEST(${BACKEND_NAME}, reshape_t2v_012)
TEST(${BACKEND_NAME}, reshape_t2s_012)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{1, 1, 1};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{};
......@@ -2386,7 +2384,6 @@ TEST(${BACKEND_NAME}, reshape_t2s_012)
TEST(${BACKEND_NAME}, reshape_t2s_120)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{1, 1, 1};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{};
......@@ -2409,7 +2406,6 @@ TEST(${BACKEND_NAME}, reshape_t2s_120)
TEST(${BACKEND_NAME}, reshape_s2t)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{1, 1, 1, 1, 1, 1};
......@@ -2476,7 +2472,6 @@ TEST(${BACKEND_NAME}, reshape_v2m_row)
TEST(${BACKEND_NAME}, reshape_v2t_middle)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{1, 3, 1};
......@@ -2606,7 +2601,6 @@ TEST(${BACKEND_NAME}, reshape_m2m_dim_change_transpose)
//
TEST(${BACKEND_NAME}, reshape_6d)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
vector<float> a_data(2 * 2 * 3 * 3 * 2 * 4);
for (int i = 0; i < 2 * 2 * 3 * 3 * 2 * 4; i++)
{
......
......@@ -147,9 +147,8 @@ TEST(builder, tensor_mask)
auto sequence_lengths = make_shared<op::Parameter>(element::u32, max_sequence_length);
Shape mask_shape{3, 5};
auto f =
make_shared<Function>(builder::tensor_mask(sequence_lengths, 1, 0, mask_shape),
op::ParameterVector{sequence_lengths});
auto f = make_shared<Function>(builder::tensor_mask(sequence_lengths, 1, 0, mask_shape),
op::ParameterVector{sequence_lengths});
auto manager = runtime::Manager::get("INTERPRETER");
auto external = manager->compile(f);
......