Commit 40ff77bd authored by Chris Sullivan, committed by Robert Kimball

Update pad on nvgpu (#1759)

* Add pad with fill operator using the outward-in index pattern.

* Remove static pad and rename build_pad_dynamic -> build_pad. Update maxpool 1d padding.

* Formatting.

* Split build_pad_dynamic into build_pad and build_pad_fill.

* Add test coverage for fixed bug in op::Pad for gpu.
parent 519b18ac
This diff is collapsed.
...@@ -50,15 +50,13 @@ namespace ngraph ...@@ -50,15 +50,13 @@ namespace ngraph
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op); size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
public: public:
size_t build_pad(const std::array<std::string, 2>& dtypes, size_t build_pad(const std::vector<std::string>& dtypes,
NVShape input_shape, NVShape input_shape,
NVShape output_shape, NVShape output_shape,
NVShape pad_below, NVShape padding_below,
NVShape pad_above, NVShape padding_interior);
NVShape pad_interior,
const std::string& pad_value = "");
size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes, size_t build_pad_fill(const std::vector<std::string>& dtypes,
NVShape input_shape, NVShape input_shape,
NVShape output_shape, NVShape output_shape,
NVShape padding_below, NVShape padding_below,
......
...@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
Shape padding_interior(data_dilation_strides); Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; bool can_find_algo = true;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
{ {
...@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
idx_workspace = allocator.reserve_workspace(temp_size, true); idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = pad_index = cuda_emitter->build_pad({{args[0].get_element_type().c_type_string(),
cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}}, out[0].get_element_type().c_type_string()}},
input_shape, input_shape,
input_shape_padded, input_shape_padded,
...@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
std::unique_ptr<gpu::primitive> kernel_launch( std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable { new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() && if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max()) pad_index != std::numeric_limits<size_t>::max())
{ {
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace); void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx, gpu::invoke_primitive(m_ctx,
pad_dynamic_index, pad_index,
std::vector<void*>{inputs[0]}.data(), std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data()); std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive( gpu::invoke_primitive(
...@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
Shape padding_interior(data_dilation_strides); Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
size_t slice_index = std::numeric_limits<size_t>::max(); size_t slice_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; bool can_find_algo = true;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
...@@ -556,7 +555,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -556,7 +555,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
idx_workspace = allocator.reserve_workspace(temp_size, true); idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}}, pad_index = cuda_emitter->build_pad({{input_type, output_type}},
output_shape, output_shape,
output_shape_padded, output_shape_padded,
padding_below, padding_below,
...@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs, std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable { void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() && if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max() && pad_index != std::numeric_limits<size_t>::max() &&
slice_index != std::numeric_limits<size_t>::max()) slice_index != std::numeric_limits<size_t>::max())
{ {
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace); void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx, gpu::invoke_primitive(m_ctx,
pad_dynamic_index, pad_index,
std::vector<void*>{inputs[0]}.data(), std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data()); std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data()); gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
...@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
Shape padding_interior(data_dilation_strides); Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; bool can_find_algo = true;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
{ {
...@@ -675,7 +674,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -675,7 +674,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
idx_workspace = allocator.reserve_workspace(temp_size, true); idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}}, pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape_0, input_shape_0,
input_shape_padded, input_shape_padded,
padding_below, padding_below,
...@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
std::unique_ptr<gpu::primitive> kernel_launch( std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable { new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() && if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max()) pad_index != std::numeric_limits<size_t>::max())
{ {
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace); void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx, gpu::invoke_primitive(m_ctx,
pad_dynamic_index, pad_index,
std::vector<void*>{inputs[0]}.data(), std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data()); std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive( gpu::invoke_primitive(
...@@ -768,7 +767,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node) ...@@ -768,7 +767,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
padded_size * args[0].get_element_type().size()); padded_size * args[0].get_element_type().size());
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}}, pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape, input_shape,
input_shape_padded, input_shape_padded,
padding_below, padding_below,
......
...@@ -638,11 +638,9 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer, ...@@ -638,11 +638,9 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
writer.block_end(); writer.block_end();
} }
void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( void runtime::gpu::CudaKernelBuilder::get_pad_op(codegen::CodeWriter& writer,
codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
GPUKernelArgs& args, GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank) size_t rank)
{ {
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature(); writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
...@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( ...@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
writer.block_end(); writer.block_end();
} }
// Writes the CUDA source of a pad-with-fill kernel into `writer`.
//
// The generated kernel assigns one thread per output element (`tid`). For each
// of the `rank` axes it peels the axis coordinate off the flat output index,
// subtracts the below-padding, and divides by (interior padding + 1) to find
// the candidate input coordinate. An output element copies from `in` only if
// every axis lands on a real input element; otherwise it receives `*pad`.
//
// writer: destination for the generated source text.
// name:   kernel name; the emitted symbol is "cuda_" + name.
// args:   provides the kernel's parameter list via get_input_signature().
//         NOTE(review): the emitted code refers to n, in, out, pad,
//         output_strides<i>, input_strides<i>, input_shape<i>,
//         padding_below<i> and padding_interior<i>; presumably these are all
//         declared by that signature -- confirm against the caller.
// rank:   number of tensor axes to generate index arithmetic for.
void runtime::gpu::CudaKernelBuilder::get_pad_fill_op(codegen::CodeWriter& writer,
                                                      const std::string& name,
                                                      GPUKernelArgs& args,
                                                      size_t rank)
{
    writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
    writer.block_begin();

    // Guard so that threads whose index is not below n write nothing.
    writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
    writer << "if (tid < n)\n";
    writer.block_begin();

    // Per-thread state of the generated kernel.
    writer << "bool in_bounds = true;\n";
    writer << "uint32_t output_pixel = tid;\n";
    writer << "uint32_t input_pixel = 0;\n";
    writer << "int32_t input, input_dil;\n";

    for (size_t axis = 0; axis < rank; ++axis)
    {
        // Drop the contribution of the previous (outer) axis from the flat
        // output index before extracting this axis' coordinate.
        if (axis > 0)
        {
            const size_t outer_axis = axis - 1;
            writer << "output_pixel %= output_strides" << outer_axis << ";\n";
        }

        // Coordinate along this axis relative to the start of the input data.
        writer << "input_dil = output_pixel / output_strides" << axis << " - padding_below"
               << axis << ";\n";

        // Quotient is the input coordinate; a non-zero remainder means the
        // output element falls on interior padding between input elements.
        writer << "input = input_dil / (padding_interior" << axis << " + 1);\n";
        writer << "input_dil %= (padding_interior" << axis << " + 1);\n";
        writer << "in_bounds = in_bounds && (input >= 0) && (input < input_shape" << axis
               << ") && (input_dil == 0);\n";

        // Accumulate the flat input index (only read when in_bounds holds).
        writer << "input_pixel += input * input_strides" << axis << ";\n";
    }

    writer << "out[tid] = (in_bounds) ? in[input_pixel] : *pad;\n";

    writer.block_end(); // closes the "if (tid < n)" block
    writer.block_end(); // closes the kernel body
}
void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op( void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
......
...@@ -130,10 +130,14 @@ namespace ngraph ...@@ -130,10 +130,14 @@ namespace ngraph
const std::string& math_kernel, const std::string& math_kernel,
const std::vector<std::string>& data_types); const std::vector<std::string>& data_types);
static void get_pad_dynamic_op(codegen::CodeWriter& writer, static void get_pad_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
size_t rank);
static void get_pad_fill_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
GPUKernelArgs& args, GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank); size_t rank);
static void get_ew_collective_op(codegen::CodeWriter& writer, static void get_ew_collective_op(codegen::CodeWriter& writer,
......
...@@ -827,11 +827,11 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS) ...@@ -827,11 +827,11 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS)
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter(); auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_index = cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}}, auto pad_index = cuda_emitter->build_pad_fill(
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape, input_shape,
output_shape, output_shape,
padding_below, padding_below,
padding_above,
padding_interior); padding_interior);
writer << "void* input[] = {" << node_names(args) << "};\n"; writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n"; writer << "void* output[] = {" << node_names(out) << "};\n";
......
...@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2) ...@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2)
EXPECT_EQ(expected, read_vector<float>(result)); EXPECT_EQ(expected, read_vector<float>(result));
} }
// Pads a batch of two images with two channels each, asymmetrically: only the
// upper ends of the two trailing axes are padded (by 2), with no padding below
// and no interior padding. Each 4x4 plane must come back as a 6x6 plane whose
// extra rows and columns hold the pad value (42).
// This case exposed a bug on the nvGPU backend and is kept as a regression test.
NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym)
{
    Shape shape_a{2, 2, 4, 4};
    Shape padding_below{0, 0, 0, 0};
    Shape padding_above{0, 0, 2, 2};
    Shape padding_interior{0, 0, 0, 0};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    // B is the scalar pad value.
    Shape shape_b{};
    auto B = make_shared<op::Parameter>(element::f32, shape_b);
    Shape shape_r{2, 2, 6, 6};
    auto f = make_shared<Function>(
        make_shared<op::Pad>(A, B, padding_below, padding_above, padding_interior),
        op::ParameterVector{A, B});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, shape_a);
    copy_data(a,
              test::NDArray<float, 4>({{{{0, 1, 0, 2}, // img 0 chan 0
                                         {0, 3, 2, 0},
                                         {2, 0, 0, 0},
                                         {0, 2, 1, 0}},

                                        {{0, 0, 0, 2}, // img 0 chan 1
                                         {0, 2, 3, 0},
                                         {2, 0, 1, 0},
                                         {2, 0, 0, 0}}},

                                       {{{0, 2, 1, 1}, // img 1 chan 0
                                         {0, 0, 2, 0},
                                         {0, 0, 1, 2},
                                         {0, 0, 0, 0}},

                                        {{2, 1, 0, 0}, // img 1 chan 1
                                         {0, 2, 0, 0},
                                         {1, 1, 2, 0},
                                         {1, 0, 0, 0}}}})
                  .get_vector());

    auto b = backend->create_tensor(element::f32, shape_b);
    copy_data(b, vector<float>{42});

    auto result = backend->create_tensor(element::f32, shape_r);

    backend->call_with_validate(f, {result}, {a, b});

    EXPECT_EQ((test::NDArray<float, 4>({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0
                                          {0, 3, 2, 0, 42, 42},
                                          {2, 0, 0, 0, 42, 42},
                                          {0, 2, 1, 0, 42, 42},
                                          {42, 42, 42, 42, 42, 42},
                                          {42, 42, 42, 42, 42, 42}},

                                         {{0, 0, 0, 2, 42, 42}, // img 0 chan 1
                                          {0, 2, 3, 0, 42, 42},
                                          {2, 0, 1, 0, 42, 42},
                                          {2, 0, 0, 0, 42, 42},
                                          {42, 42, 42, 42, 42, 42},
                                          {42, 42, 42, 42, 42, 42}}},

                                        {{{0, 2, 1, 1, 42, 42}, // img 1 chan 0
                                          {0, 0, 2, 0, 42, 42},
                                          {0, 0, 1, 2, 42, 42},
                                          {0, 0, 0, 0, 42, 42},
                                          {42, 42, 42, 42, 42, 42},
                                          {42, 42, 42, 42, 42, 42}},

                                         {{2, 1, 0, 0, 42, 42}, // img 1 chan 1
                                          {0, 2, 0, 0, 42, 42},
                                          {1, 1, 2, 0, 42, 42},
                                          {1, 0, 0, 0, 42, 42},
                                          {42, 42, 42, 42, 42, 42},
                                          {42, 42, 42, 42, 42, 42}}}})
                   .get_vector()),
              read_vector<float>(result));
}
// Trivial case with no reduced axes. // Trivial case with no reduced axes.
NGRAPH_TEST(${BACKEND_NAME}, product_trivial) NGRAPH_TEST(${BACKEND_NAME}, product_trivial)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment