Commit bdfcf5b4 authored by Fenglei, committed by Adam Procter

gpu reverse sequence (#1109)

* add reverse_sequence

* fix bugs, compiled

* fix index bug

* fix bug and clang format

* correct function name

* clang format

* remove extra ;

* remove tests from skip list

* add backward support, skip tests

* add back template<> line

* remove unnecessary lines in kernel
parent 94844d13
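For reviewers unfamiliar with the op: ReverseSequence reverses, independently for each slice along the batch axis, the first sequence_lengths[b] elements along the sequence axis, and copies the remaining elements unchanged. A minimal host-side reference sketch of these semantics for a row-major [batch x width] tensor with batch_axis = 0 and sequence_axis = 1 (the function and names below are illustrative only, not part of this commit):

#include <algorithm>
#include <cstdint>
#include <vector>

// Reverse the first lengths[b] elements of each row b; leave the tail as-is.
std::vector<float> reverse_sequence_2d(const std::vector<float>& in,
                                       const std::vector<int32_t>& lengths,
                                       size_t batch,
                                       size_t width)
{
    std::vector<float> out(in);
    for (size_t b = 0; b < batch; b++)
    {
        auto row = out.begin() + b * width;
        std::reverse(row, row + lengths[b]);
    }
    return out;
}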
...@@ -331,6 +331,87 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const runtime::gpu::GPURunti
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape0,
GPUShape input_shape1,
GPUShape output_shape,
size_t batch_axis,
size_t sequence_axis)
{
std::stringstream kernel_name;
kernel_name << "reverse_sequence_" << join(dtypes, "_") << "_bi_" << batch_axis << "_si_"
<< sequence_axis << "_r_" << output_shape.size();
std::string hash = kernel_name.str() + "_i" + join(input_shape0, "_") + "_i" +
join(input_shape1, "_") + "_o" + join(output_shape, "_");
// For backwards compatibility we currently use two unordered maps
// 1. one looks up the compiled cuda kernel (CudaFunctionPool)
// 2. the other checks whether this kernel is already in the primitive list
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_reverse_sequence_op(
writer, kernel_name.str(), dtypes, batch_axis, sequence_axis, output_shape.size());
compiled_kernel = ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
GPUShape output_strides = row_major_strides(output_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_output_shape =
allocator.reserve_argspace(output_shape.data(), output_shape.size() * sizeof(uint32_t));
size_t idx_output_strides =
allocator.reserve_argspace(output_strides.data(), output_strides.size() * sizeof(uint32_t));
// create the launch primitive
std::unique_ptr<gpu::primitive> reverse_sequence(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* param_output_shape = runtime::gpu::invoke_memory_primitive(ctx, idx_output_shape);
void* param_output_strides =
runtime::gpu::invoke_memory_primitive(ctx, idx_output_strides);
std::vector<void*> args_list{&inputs[0],
&inputs[1],
&outputs[0],
&param_output_shape,
&param_output_strides,
&nthreads};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
static_cast<uint32_t>(nthreads),
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list.data(),
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // block until the kernel completes
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reverse_sequence));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
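// Hedged usage sketch (illustrative, not part of this diff): once built, the
// primitive index is cached, and the launch is later driven through
// invoke_primitive with arrays of device pointers. Assumes ctx, cuda_emitter,
// the shapes, and device buffers dev_in, dev_seq, dev_out already exist:
//
//     size_t rs_index = cuda_emitter->build_reverse_sequence(
//         ctx, {{"float", "int32_t", "float"}}, in_shape, seq_shape, out_shape, 0, 1);
//     std::vector<void*> ins{dev_in, dev_seq};
//     std::vector<void*> outs{dev_out};
//     runtime::gpu::invoke_primitive(ctx, rs_index, ins.data(), outs.data());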
size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
...@@ -1377,6 +1458,15 @@ __device__ __forceinline__ float load(const float* __restrict__ in, int i=0, b
}
return v;
}
__device__ __forceinline__ int32_t load(const int32_t* __restrict__ in, int i=0, bool b=true)
{
int32_t v = 0;
if (b)
{
v = __ldg(in + i);
}
return v;
}
__device__ __forceinline__ int64_t load(const int64_t* __restrict__ in, int i=0, bool b=true)
{
int64_t v = 0;
...
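The load() overloads above wrap __ldg, which reads through the GPU's read-only data cache; the bool argument lets generated code predicate the load itself instead of branching around it. A hedged CUDA sketch of the pattern (the kernel below is illustrative and assumes the int32_t load helper above is in scope; it is not emitted by this commit):

extern "C" __global__ void doubled_example(const int32_t* __restrict__ in, int32_t* out, uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    // The predicate guards the read; out-of-range threads get the default 0.
    int32_t v = load(in, tid, tid < n);
    if (tid < n)
    {
        out[tid] = v * 2;
    }
}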
...@@ -78,6 +78,14 @@ namespace ngraph
GPUShape reduce_window_shape,
GPUShape reduce_window_strides);
size_t build_reverse_sequence(const runtime::gpu::GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape0,
GPUShape input_shape1,
GPUShape output_shape,
size_t batch_axis,
size_t sequence_axis);
template <typename T>
size_t build_elementwise(const GPURuntimeContext* ctx,
const std::vector<std::string>& dtypes,
...
...@@ -313,6 +313,53 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
const size_t batch_axis,
const size_t sequence_axis,
const size_t rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* sequence, " << data_types[2] << "* out, "
<< "uint32_t* output_shape, uint32_t* output_strides, uint32_t n)\n";
writer.block_begin();
{
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "uint32_t input_idx = tid;\n";
for (size_t i = 0; i < rank - 1; i++)
{
writer << "uint32_t output_idx_" << i << " = input_idx / output_strides[" << i
<< "];\n";
writer << "input_idx %= output_strides[" << i << "];\n";
}
writer << "uint32_t output_idx_" << rank - 1 << " = input_idx / output_strides["
<< rank - 1 << "];\n";
writer << "uint32_t sequence_length = sequence[output_idx_" << batch_axis << "];\n";
writer << "assert(sequence_length <= output_shape[" << sequence_axis << "]);\n";
writer << "bool need_reverse = (output_idx_" << sequence_axis
<< " < sequence_length) && (sequence_length > 1);\n";
writer << "output_idx_" << sequence_axis
<< " = need_reverse ? sequence_length - output_idx_" << sequence_axis
<< " - 1 : output_idx_" << sequence_axis << ";\n";
writer << "uint32_t output_idx = need_reverse ? ";
writer << "output_idx_" << 0 << " * output_strides[" << 0 << "]";
for (size_t i = 1; i < rank; i++)
{
writer << " + output_idx_" << i << " * output_strides[" << i << "]";
}
writer << " : tid;\n";
writer << "out[output_idx] = in[tid];\n";
}
writer.block_end();
}
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_slice_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
...
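For concreteness, the code get_reverse_sequence_op produces for rank = 2, batch_axis = 0, sequence_axis = 1, and data_types {"float", "int32_t", "float"} looks like the following (reconstructed by hand from the writer calls above; formatting approximate, comments added):

extern "C" __global__ void cuda_reverse_sequence_float_int32_t_float_bi_0_si_1_r_2(
    float* in, int32_t* sequence, float* out,
    uint32_t* output_shape, uint32_t* output_strides, uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        // Decompose the flat thread id into per-axis output coordinates.
        uint32_t input_idx = tid;
        uint32_t output_idx_0 = input_idx / output_strides[0];
        input_idx %= output_strides[0];
        uint32_t output_idx_1 = input_idx / output_strides[1];
        // Only the first sequence_length positions along the sequence axis flip.
        uint32_t sequence_length = sequence[output_idx_0];
        assert(sequence_length <= output_shape[1]);
        bool need_reverse = (output_idx_1 < sequence_length) && (sequence_length > 1);
        output_idx_1 = need_reverse ? sequence_length - output_idx_1 - 1 : output_idx_1;
        uint32_t output_idx =
            need_reverse ? output_idx_0 * output_strides[0] + output_idx_1 * output_strides[1] : tid;
        out[output_idx] = in[tid];
    }
}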
...@@ -76,6 +76,13 @@ namespace ngraph
const std::vector<std::string>& data_types,
const size_t rank);
static void get_reverse_sequence_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
const size_t batch_axis,
const size_t sequence_axis,
const size_t rank);
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
...
...@@ -80,6 +80,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/reverse_sequence.hpp"
#include "ngraph/op/select.hpp" #include "ngraph/op/select.hpp"
#include "ngraph/op/select_and_scatter.hpp" #include "ngraph/op/select_and_scatter.hpp"
#include "ngraph/op/sign.hpp" #include "ngraph/op/sign.hpp"
...@@ -1151,6 +1152,38 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -1151,6 +1152,38 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
writer.block_end(); writer.block_end();
} }
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::ReverseSequence)
{
if (out[0].get_size() == 0)
{
return;
}
auto rs = static_cast<const ngraph::op::ReverseSequence*>(node);
size_t bi = rs->get_batch_axis();
size_t si = rs->get_sequence_axis();
auto arg_shape0 = args[0].get_shape();
auto arg_shape1 = args[1].get_shape();
auto out_shape = out[0].get_shape();
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto rs_index = cuda_emitter->build_reverse_sequence(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
arg_shape0,
arg_shape1,
out_shape,
bi,
si);
writer << "gpu::invoke_primitive(ctx, " << rs_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ", " << args[1].get_name()
<< "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Multiply)
{
...
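The writer statements above splice a single call into the generated function; for a node whose tensors are named (hypothetically) arg0, arg1, and out0, and whose primitive landed at index 7, the emitted line would read:

gpu::invoke_primitive(ctx, 7, std::vector<void*>{arg0, arg1}.data(), std::vector<void*>{out0}.data());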
...@@ -91,6 +91,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/reverse_sequence.hpp"
#include "ngraph/op/select.hpp" #include "ngraph/op/select.hpp"
#include "ngraph/op/select_and_scatter.hpp" #include "ngraph/op/select_and_scatter.hpp"
#include "ngraph/op/sign.hpp" #include "ngraph/op/sign.hpp"
...@@ -223,6 +224,8 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -223,6 +224,8 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Not>}, {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::emit_elementwise<ngraph::op::Not>},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>}, {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>}, {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>},
{TI(ngraph::op::ReverseSequence),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::ReverseSequence>},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>},
{TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReduceWindow>},
{TI(ngraph::op::SelectAndScatter),
...
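The dispatcher is a map keyed on a node's std::type_index, so registering ReverseSequence here is all that is needed to route the op through codegen. A simplified sketch of the lookup pattern (illustrative; the exact signatures in gpu_external_function.cpp may differ):

// TI(x) expands to type_index(typeid(x)); handler is an emitter function pointer.
auto handler = dispatcher.find(TI(*node));
if (handler == dispatcher.end())
{
    throw ngraph_error("Unhandled op in GPU emitter dispatch");
}
handler->second(external_function, writer, node.get(), in, out);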
abc_int64
backwards_reverse_sequence_n4d2c3h2w2
backwards_reverse_sequence_n3_c2_h3
backwards_slice
batch_norm_one_output
batch_norm_three_outputs
...@@ -31,9 +29,6 @@ one_hot_vector_1_barely_oob
one_hot_vector_1_far_oob
one_hot_vector_1_fp_nonint
parameter_as_output
reverse_sequence_n4d2c3h2w2
reverse_sequence_n4c3h2w2
reverse_sequence_n2c3h4w2
scalar_constant_float32
scalar_constant_int64
select_and_scatter_3d_without_overlap
...