Commit bdfcf5b4 authored by Fenglei, committed by Adam Procter

gpu reverse sequence (#1109)

* add reverse_sequence

* fix bugs, compiled

* fix index bug

* fix bug and clang format

* correct function name

* clang format

* remove extra ;

* remove tests from skip list

* add backward support, skip tests

* add back template<> line

* remove unnecessary lines in kernel
parent 94844d13
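For reviewers unfamiliar with the op: ReverseSequence reverses, independently for each slice along the batch axis, the first sequence_lengths[b] elements along the sequence axis, and copies the remaining elements unchanged. A minimal host-side reference sketch of these semantics for a row-major [batch x width] tensor with batch_axis = 0 and sequence_axis = 1 (the function and names below are illustrative only, not part of this commit):

#include <algorithm>
#include <cstdint>
#include <vector>

// Reverse the first lengths[b] elements of each row b; leave the tail as-is.
std::vector<float> reverse_sequence_2d(const std::vector<float>& in,
                                       const std::vector<int32_t>& lengths,
                                       size_t batch,
                                       size_t width)
{
    std::vector<float> out(in);
    for (size_t b = 0; b < batch; b++)
    {
        auto row = out.begin() + b * width;
        std::reverse(row, row + lengths[b]);
    }
    return out;
}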
...@@ -331,6 +331,87 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape0,
GPUShape input_shape1,
GPUShape output_shape,
size_t batch_axis,
size_t sequence_axis)
{
std::stringstream kernel_name;
kernel_name << "reverse_sequence_" << join(dtypes, "_") << "_bi_" << batch_axis << "_si_"
<< sequence_axis << "_r_" << output_shape.size();
std::string hash = kernel_name.str() + "_i" + join(input_shape0, "_") + "_i" +
join(input_shape1, "_") + "_o" + join(output_shape, "_");
// For backwards compatibility we currently use two unordered maps
// 1. one looks up the compiled cuda kernel (CudaFunctionPool)
// 2. the other checks whether this kernel is already in the primitive list
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_reverse_sequence_op(
writer, kernel_name.str(), dtypes, batch_axis, sequence_axis, output_shape.size());
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
GPUShape output_strides = row_major_strides(output_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_output_shape =
allocator.reserve_argspace(output_shape.data(), output_shape.size() * sizeof(uint32_t));
size_t idx_output_strides =
allocator.reserve_argspace(output_strides.data(), output_strides.size() * sizeof(uint32_t));
// create the launch primitive
std::unique_ptr<gpu::primitive> reverse_sequence(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* param_output_shape = runtime::gpu::invoke_memory_primitive(ctx, idx_output_shape);
void* param_output_strides =
runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
std::vector<void*> args_list{&inputs[0],
&inputs[1],
&outputs[0],
&param_output_shape,
&param_output_strides,
&nthreads};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<uint32_t>(nthreads),
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list.data(),
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // block until the kernel completes
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reverse_sequence));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
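// Hedged usage sketch (illustrative, not part of this diff): once built, the
// primitive index is cached, and the launch is later driven through
// invoke_primitive with arrays of device pointers. Assumes ctx, cuda_emitter,
// the shapes, and device buffers dev_in, dev_seq, dev_out already exist:
//
//     size_t rs_index = cuda_emitter->build_reverse_sequence(
//         ctx, {{"float", "int32_t", "float"}}, in_shape, seq_shape, out_shape, 0, 1);
//     std::vector<void*> ins{dev_in, dev_seq};
//     std::vector<void*> outs{dev_out};
//     runtime::gpu::invoke_primitive(ctx, rs_index, ins.data(), outs.data());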
size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
...@@ -1377,6 +1458,15 @@ __device__ __forceinline__ float load(const float* __restrict__ in, int i=0, b
}
return v;
}
__device__ __forceinline__ int32_t load(const int32_t* __restrict__ in, int i=0, bool b=true)
{
int32_t v = 0;
if (b)
{
v = __ldg(in + i);
}
return v;
}
__device__ __forceinline__ int64_t load(const int64_t* __restrict__ in, int i=0, bool b=true)
{
int64_t v = 0;
...
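The load() overloads above wrap __ldg, which reads through the GPU's read-only data cache; the bool argument lets generated code predicate the load itself instead of branching around it. A hedged CUDA sketch of the pattern (the kernel below is illustrative and assumes the int32_t load helper above is in scope; it is not emitted by this commit):

extern "C" __global__ void doubled_example(const int32_t* __restrict__ in, int32_t* out, uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    // The predicate guards the read; out-of-range threads get the default 0.
    int32_t v = load(in, tid, tid < n);
    if (tid < n)
    {
        out[tid] = v * 2;
    }
}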
...@@ -78,6 +78,14 @@ namespace ngraph
GPUShape reduce_window_shape,
GPUShape reduce_window_strides);
size_t build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape0,
GPUShape input_shape1,
GPUShape output_shape,
size_t batch_axis,
size_t sequence_axis);
template <typename T>
size_t build_elementwise(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
...
...@@ -313,6 +313,53 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
const size_t batch_axis,
const size_t sequence_axis,
const size_t rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* sequence, " << data_types[2] << "* out, "
<< "uint32_t* output_shape, uint32_t* output_strides, uint32_t n)\n";
writer.block_begin();
{
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "uint32_t input_idx = tid;\n";
for (size_t i = 0; i < rank - 1; i++)
{
writer << "uint32_t output_idx_" << i << " = input_idx / output_strides[" << i
<< "];\n";
writer << "input_idx %= output_strides[" << i << "];\n";
}
writer << "uint32_t output_idx_" << rank - 1 << " = input_idx / output_strides["
<< rank - 1 << "];\n";
writer << "uint32_t sequence_length = sequence[output_idx_" << batch_axis << "];\n";
writer << "assert(sequence_length <= output_shape[" << sequence_axis << "]);\n";
writer << "bool need_reverse = (output_idx_" << sequence_axis
<< " < sequence_length) && (sequence_length > 1);\n";
writer << "output_idx_" << sequence_axis
<< " = need_reverse ? sequence_length - output_idx_" << sequence_axis
<< " - 1 : output_idx_" << sequence_axis << ";\n";
writer << "uint32_t output_idx = need_reverse ? ";
writer << "output_idx_" << 0 << " * output_strides[" << 0 << "]";
for (size_t i = 1; i < rank; i++)
{
writer << " + output_idx_" << i << " * output_strides[" << i << "]";
}
writer << " : tid;\n";
writer << "out[output_idx] = in[tid];\n";
}
writer.block_end();
}
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_slice_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
...
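For concreteness, the code get_reverse_sequence_op produces for rank = 2, batch_axis = 0, sequence_axis = 1, and data_types {"float", "int32_t", "float"} looks like the following (reconstructed by hand from the writer calls above; formatting approximate, comments added):

extern "C" __global__ void cuda_reverse_sequence_float_int32_t_float_bi_0_si_1_r_2(
    float* in, int32_t* sequence, float* out,
    uint32_t* output_shape, uint32_t* output_strides, uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        // Decompose the flat thread id into per-axis output coordinates.
        uint32_t input_idx = tid;
        uint32_t output_idx_0 = input_idx / output_strides[0];
        input_idx %= output_strides[0];
        uint32_t output_idx_1 = input_idx / output_strides[1];
        // Only the first sequence_length positions along the sequence axis flip.
        uint32_t sequence_length = sequence[output_idx_0];
        assert(sequence_length <= output_shape[1]);
        bool need_reverse = (output_idx_1 < sequence_length) && (sequence_length > 1);
        output_idx_1 = need_reverse ? sequence_length - output_idx_1 - 1 : output_idx_1;
        uint32_t output_idx =
            need_reverse ? output_idx_0 * output_strides[0] + output_idx_1 * output_strides[1] : tid;
        out[output_idx] = in[tid];
    }
}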
...@@ -76,6 +76,13 @@ namespace ngraph
const std::vector<std::string>& data_types,
const size_t rank);
static void get_reverse_sequence_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
const size_t batch_axis,
const size_t sequence_axis,
const size_t rank);
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
...
...@@ -80,6 +80,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/reverse_sequence.hpp"
#include "ngraph/op/select.hpp" #include "ngraph/op/select.hpp"
#include "ngraph/op/select_and_scatter.hpp" #include "ngraph/op/select_and_scatter.hpp"
#include "ngraph/op/sign.hpp" #include "ngraph/op/sign.hpp"
...@@ -1151,6 +1152,38 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -1151,6 +1152,38 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer.block_end(); writer.block_end();
} }
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::ReverseSequence)
{
if (out[0].get_size() == 0)
{
return;
}
auto rs = static_cast<const ngraph::op::ReverseSequence*>(node);
size_t bi = rs->get_batch_axis();
size_t si = rs->get_sequence_axis();
auto arg_shape0 = args[0].get_shape();
auto arg_shape1 = args[1].get_shape();
auto out_shape = out[0].get_shape();
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto rs_index = cuda_emitter->build_reverse_sequence(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
arg_shape0,
arg_shape1,
out_shape,
bi,
si);
writer << "gpu::invoke_primitive(ctx, " << rs_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ", " << args[1].get_name()
<< "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Multiply)
{
...
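The writer statements above splice a single call into the generated function; for a node whose tensors are named (hypothetically) arg0, arg1, and out0, and whose primitive landed at index 7, the emitted line would read:

gpu::invoke_primitive(ctx, 7, std::vector<void*>{arg0, arg1}.data(), std::vector<void*>{out0}.data());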
...@@ -91,6 +91,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/reverse_sequence.hpp"
#include "ngraph/op/select.hpp" #include "ngraph/op/select.hpp"
#include "ngraph/op/select_and_scatter.hpp" #include "ngraph/op/select_and_scatter.hpp"
#include "ngraph/op/sign.hpp" #include "ngraph/op/sign.hpp"
...@@ -223,6 +224,8 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -223,6 +224,8 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Not>}, {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Not>},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>}, {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>}, {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>},
{TI(ngraph::op::ReverseSequence),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::ReverseSequence>},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>},
{TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReduceWindow>},
{TI(ngraph::op::SelectAndScatter),
...
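The dispatcher is a map keyed on a node's std::type_index, so registering ReverseSequence here is all that is needed to route the op through codegen. A simplified sketch of the lookup pattern (illustrative; the exact signatures in gpu_external_function.cpp may differ):

// TI(x) expands to type_index(typeid(x)); handler is an emitter function pointer.
auto handler = dispatcher.find(TI(*node));
if (handler == dispatcher.end())
{
    throw ngraph_error("Unhandled op in GPU emitter dispatch");
}
handler->second(external_function, writer, node.get(), in, out);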
abc_int64
backwards_reverse_sequence_n4d2c3h2w2
backwards_reverse_sequence_n3_c2_h3
backwards_slice
batch_norm_one_output
batch_norm_three_outputs
...@@ -31,9 +29,6 @@ one_hot_vector_1_barely_oob
one_hot_vector_1_far_oob
one_hot_vector_1_fp_nonint
parameter_as_output
reverse_sequence_n4d2c3h2w2
reverse_sequence_n4c3h2w2
reverse_sequence_n2c3h4w2
scalar_constant_float32
scalar_constant_int64
select_and_scatter_3d_without_overlap
...