Commit 40ff77bd authored by Chris Sullivan, committed by Robert Kimball

Update pad on nvgpu (#1759)

* Add pad-with-fill operator using the outward-in index pattern (see the sketch below).

* Remove the static pad and rename build_pad_dynamic -> build_pad. Update 1d maxpool padding.

* Formatting.

* Split build_pad_dynamic into build_pad and build_pad_fill.

* Add test coverage for a fixed bug in op::Pad on the GPU backend.
parent 519b18ac
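
Editor's note: the "outward-in index pattern" named in the first bullet means each CUDA thread owns one output element and decodes its flat output index dimension by dimension, outermost first, mapping it back to an input element, or to the fill value when it lands in exterior padding or an interior (dilation) gap. A minimal host-side sketch of that mapping follows; the helper name and parameter shapes are illustrative, not taken from this commit:

    #include <cstdint>
    #include <vector>

    // Map one flat output index back to a flat input index, walking the
    // dimensions from outermost to innermost. Returns false when the output
    // element lies in exterior padding or an interior (dilation) gap, in
    // which case it should take the fill value instead.
    bool outward_in_map(uint32_t out_idx,
                        const std::vector<uint32_t>& out_strides,
                        const std::vector<uint32_t>& in_strides,
                        const std::vector<uint32_t>& in_shape,
                        const std::vector<int32_t>& pad_below,
                        const std::vector<int32_t>& pad_interior,
                        uint32_t& in_idx)
    {
        uint32_t rem = out_idx; // portion of the output index not yet decoded
        in_idx = 0;
        for (size_t i = 0; i < in_shape.size(); i++)
        {
            int32_t d = static_cast<int32_t>(rem / out_strides[i]) - pad_below[i];
            rem %= out_strides[i];
            int32_t step = pad_interior[i] + 1; // dilated stride along dim i
            if (d < 0 || d % step != 0 || d / step >= static_cast<int32_t>(in_shape[i]))
            {
                return false; // exterior padding or dilation gap
            }
            in_idx += static_cast<uint32_t>(d / step) * in_strides[i];
        }
        return true;
    }
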
@@ -50,19 +50,17 @@ namespace ngraph
     size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
 public:
-    size_t build_pad(const std::array<std::string, 2>& dtypes,
+    size_t build_pad(const std::vector<std::string>& dtypes,
                      NVShape input_shape,
                      NVShape output_shape,
-                     NVShape pad_below,
-                     NVShape pad_above,
-                     NVShape pad_interior,
-                     const std::string& pad_value = "");
-    size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
-                             NVShape input_shape,
-                             NVShape output_shape,
-                             NVShape padding_below,
-                             NVShape padding_interior);
+                     NVShape padding_below,
+                     NVShape padding_interior);
+    size_t build_pad_fill(const std::vector<std::string>& dtypes,
+                          NVShape input_shape,
+                          NVShape output_shape,
+                          NVShape padding_below,
+                          NVShape padding_interior);
     size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
                              NVShape input_shape,
@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     Shape padding_interior(data_dilation_strides);
     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
             idx_workspace = allocator.reserve_workspace(temp_size, true);
             auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-            pad_dynamic_index =
-                cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
+            pad_index = cuda_emitter->build_pad({{args[0].get_element_type().c_type_string(),
                                                   out[0].get_element_type().c_type_string()}},
                                                 input_shape,
                                                 input_shape_padded,
@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             if (idx_workspace != std::numeric_limits<size_t>::max() &&
-                pad_dynamic_index != std::numeric_limits<size_t>::max())
+                pad_index != std::numeric_limits<size_t>::max())
             {
                 void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                 gpu::invoke_primitive(m_ctx,
-                                      pad_dynamic_index,
+                                      pad_index,
                                       std::vector<void*>{inputs[0]}.data(),
                                       std::vector<void*>{pad_buffer}.data());
                 gpu::invoke_primitive(
@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     Shape padding_interior(data_dilation_strides);
     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     size_t slice_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
@@ -556,11 +555,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
             idx_workspace = allocator.reserve_workspace(temp_size, true);
             auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-            pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                                output_shape,
-                                                                output_shape_padded,
-                                                                padding_below,
-                                                                padding_interior);
+            pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                                output_shape,
+                                                output_shape_padded,
+                                                padding_below,
+                                                padding_interior);
             slice_index = cuda_emitter->build_slice({{input_type, output_type}},
                                                     output_shape_padded,
@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
                                                                          void** outputs) mutable {
         if (idx_workspace != std::numeric_limits<size_t>::max() &&
-            pad_dynamic_index != std::numeric_limits<size_t>::max() &&
+            pad_index != std::numeric_limits<size_t>::max() &&
             slice_index != std::numeric_limits<size_t>::max())
         {
             void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
             gpu::invoke_primitive(m_ctx,
-                                  pad_dynamic_index,
+                                  pad_index,
                                   std::vector<void*>{inputs[0]}.data(),
                                   std::vector<void*>{pad_buffer}.data());
             gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     Shape padding_interior(data_dilation_strides);
     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
@@ -675,11 +674,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
             idx_workspace = allocator.reserve_workspace(temp_size, true);
             auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-            pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                                input_shape_0,
-                                                                input_shape_padded,
-                                                                padding_below,
-                                                                padding_interior);
+            pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                                input_shape_0,
+                                                input_shape_padded,
+                                                padding_below,
+                                                padding_interior);
             // asymmetric padding has been applied, zero out padding vectors to
             // ensure cudnn does not assume padding
@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             if (idx_workspace != std::numeric_limits<size_t>::max() &&
-                pad_dynamic_index != std::numeric_limits<size_t>::max())
+                pad_index != std::numeric_limits<size_t>::max())
             {
                 void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                 gpu::invoke_primitive(m_ctx,
-                                      pad_dynamic_index,
+                                      pad_index,
                                       std::vector<void*>{inputs[0]}.data(),
                                       std::vector<void*>{pad_buffer}.data());
                 gpu::invoke_primitive(
@@ -768,11 +767,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
                                               padded_size * args[0].get_element_type().size());
         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                    input_shape,
-                                                    input_shape_padded,
-                                                    padding_below,
-                                                    padding_interior);
+        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                            input_shape,
+                                            input_shape_padded,
+                                            padding_below,
+                                            padding_interior);
         // asymmetric padding has been applied, zero out padding vectors to
         // ensure cuDNN does not assume padding during pooling
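
Editor's note: the "zero out padding vectors" comment above refers to the pre-padding trick used throughout these hunks; once the explicit pad kernel has materialized an asymmetrically padded input on the device, the padding handed to cuDNN must be cleared so the window is not padded a second time. Schematically (a hypothetical helper for illustration; the literal code is in the collapsed context):

    #include <algorithm>
    #include <vector>

    // Once an asymmetrically padded input has been materialized by the
    // explicit pad kernel, zero the padding vectors so cuDNN does not
    // apply padding again on top of it.
    void clear_padding(std::vector<size_t>& padding_below,
                       std::vector<size_t>& padding_above)
    {
        std::fill(padding_below.begin(), padding_below.end(), 0);
        std::fill(padding_above.begin(), padding_above.end(), 0);
    }
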
@@ -638,12 +638,10 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
     writer.block_end();
 }
-void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
-    codegen::CodeWriter& writer,
-    const std::string& name,
-    GPUKernelArgs& args,
-    const std::array<std::string, 2>& data_types,
-    size_t rank)
+void runtime::gpu::CudaKernelBuilder::get_pad_op(codegen::CodeWriter& writer,
+                                                 const std::string& name,
+                                                 GPUKernelArgs& args,
+                                                 size_t rank)
 {
     writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
     writer.block_begin();
@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
     writer.block_end();
 }
+void runtime::gpu::CudaKernelBuilder::get_pad_fill_op(codegen::CodeWriter& writer,
+                                                      const std::string& name,
+                                                      GPUKernelArgs& args,
+                                                      size_t rank)
+{
+    writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
+    writer.block_begin();
+    {
+        writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
+        writer << "if (tid < n)\n";
+        writer.block_begin();
+        {
+            writer << "bool in_bounds = true;\n";
+            writer << "uint32_t output_pixel = tid;\n";
+            writer << "uint32_t input_pixel = 0;\n";
+            writer << "int32_t input, input_dil;\n";
+            for (size_t i = 0; i < rank; i++)
+            {
+                if (i != 0)
+                {
+                    writer << "output_pixel %= output_strides" << i - 1 << ";\n";
+                }
+                writer << "input_dil = output_pixel / output_strides" << i << " - padding_below"
+                       << i << ";\n";
+                writer << "input = input_dil / (padding_interior" << i << " + 1);\n";
+                writer << "input_dil %= (padding_interior" << i << " + 1);\n";
+                writer << "in_bounds = in_bounds && (input >= 0) && (input < input_shape" << i
+                       << ") && (input_dil == 0);\n";
+                writer << "input_pixel += input * input_strides" << i << ";\n";
+            }
+            writer << "out[tid] = (in_bounds) ? in[input_pixel] : *pad;\n";
+        }
+        writer.block_end();
+    }
+    writer.block_end();
+}
 void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
     codegen::CodeWriter& writer,
     const std::string& name,
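
Editor's note: for orientation, here is roughly the kernel that get_pad_fill_op emits for rank = 2 and float data. The parameter list is produced by GPUKernelArgs::get_input_signature(), so the exact names, types, and ordering below are an assumption; the body mirrors the writer calls above line for line.

    // Sketch of the generated pad-fill kernel for a rank-2 float tensor.
    extern "C" __global__ void cuda_pad_fill(float* in,
                                             float* pad, // scalar fill value
                                             float* out,
                                             uint32_t input_strides0, uint32_t input_strides1,
                                             uint32_t output_strides0, uint32_t output_strides1,
                                             uint32_t padding_below0, uint32_t padding_below1,
                                             uint32_t padding_interior0, uint32_t padding_interior1,
                                             uint32_t input_shape0, uint32_t input_shape1,
                                             uint32_t n)
    {
        uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            bool in_bounds = true;
            uint32_t output_pixel = tid;
            uint32_t input_pixel = 0;
            int32_t input, input_dil;
            // dimension 0 (outermost)
            input_dil = output_pixel / output_strides0 - padding_below0;
            input = input_dil / (padding_interior0 + 1);
            input_dil %= (padding_interior0 + 1);
            in_bounds = in_bounds && (input >= 0) && (input < input_shape0) && (input_dil == 0);
            input_pixel += input * input_strides0;
            // dimension 1 (innermost)
            output_pixel %= output_strides0;
            input_dil = output_pixel / output_strides1 - padding_below1;
            input = input_dil / (padding_interior1 + 1);
            input_dil %= (padding_interior1 + 1);
            in_bounds = in_bounds && (input >= 0) && (input < input_shape1) && (input_dil == 0);
            input_pixel += input * input_strides1;
            // in-bounds threads copy their input element; the rest take the fill value
            out[tid] = (in_bounds) ? in[input_pixel] : *pad;
        }
    }
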
@@ -130,11 +130,15 @@ namespace ngraph
                 const std::string& math_kernel,
                 const std::vector<std::string>& data_types);
-            static void get_pad_dynamic_op(codegen::CodeWriter& writer,
-                                           const std::string& name,
-                                           GPUKernelArgs& args,
-                                           const std::array<std::string, 2>& data_types,
-                                           size_t rank);
+            static void get_pad_op(codegen::CodeWriter& writer,
+                                   const std::string& name,
+                                   GPUKernelArgs& args,
+                                   size_t rank);
+            static void get_pad_fill_op(codegen::CodeWriter& writer,
+                                        const std::string& name,
+                                        GPUKernelArgs& args,
+                                        size_t rank);
             static void get_ew_collective_op(codegen::CodeWriter& writer,
                                              const std::string& name,
@@ -827,12 +827,12 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS)
     auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
-    auto pad_index = cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
-                                             input_shape,
-                                             output_shape,
-                                             padding_below,
-                                             padding_above,
-                                             padding_interior);
+    auto pad_index = cuda_emitter->build_pad_fill(
+        {{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
+        input_shape,
+        output_shape,
+        padding_below,
+        padding_interior);
     writer << "void* input[] = {" << node_names(args) << "};\n";
     writer << "void* output[] = {" << node_names(out) << "};\n";
     writer << "gpu::invoke_primitive(ctx, " << pad_index << ", input, output);\n";
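
Editor's note: unlike the old build_pad, build_pad_fill is not passed padding_above. The kernel never needs it; the upper padding is implied by output_shape, and any output coordinate beyond the last dilated input element simply fails the in-bounds test and receives the fill value. The shape invariant being relied on, as a sketch (check_pad_shapes is a hypothetical helper, not part of this commit):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // For every dimension: out = below + dilated-input extent + above,
    // where the dilated extent inserts `interior` elements between each
    // pair of neighbouring input elements.
    void check_pad_shapes(const std::vector<size_t>& in,
                          const std::vector<size_t>& out,
                          const std::vector<size_t>& below,
                          const std::vector<size_t>& above,
                          const std::vector<size_t>& interior)
    {
        for (size_t i = 0; i < in.size(); i++)
        {
            size_t dilated = (in[i] == 0) ? 0 : (in[i] - 1) * (interior[i] + 1) + 1;
            assert(out[i] == below[i] + dilated + above[i]);
        }
    }
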
@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2)
     EXPECT_EQ(expected, read_vector<float>(result));
 }
+// This test covers the case with multiple images and asymmetric padding;
+// it guards against a bug that was found in the nvGPU backend.
+NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym)
+{
+    Shape shape_a{2, 2, 4, 4};
+    auto window_movement_strides = Strides{2, 2};
+    Shape padding_below{0, 0, 0, 0};
+    Shape padding_above{0, 0, 2, 2};
+    Shape padding_interior{0, 0, 0, 0};
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    Shape shape_b{};
+    auto B = make_shared<op::Parameter>(element::f32, shape_b);
+    Shape shape_r{2, 2, 6, 6};
+    auto f = make_shared<Function>(
+        make_shared<op::Pad>(A, B, padding_below, padding_above, padding_interior),
+        op::ParameterVector{A, B});
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape_a);
+    copy_data(a,
+              test::NDArray<float, 4>({{{{0, 1, 0, 2}, // img 0 chan 0
+                                         {0, 3, 2, 0},
+                                         {2, 0, 0, 0},
+                                         {0, 2, 1, 0}},
+                                        {{0, 0, 0, 2}, // img 0 chan 1
+                                         {0, 2, 3, 0},
+                                         {2, 0, 1, 0},
+                                         {2, 0, 0, 0}}},
+                                       {{{0, 2, 1, 1}, // img 1 chan 0
+                                         {0, 0, 2, 0},
+                                         {0, 0, 1, 2},
+                                         {0, 0, 0, 0}},
+                                        {{2, 1, 0, 0}, // img 1 chan 1
+                                         {0, 2, 0, 0},
+                                         {1, 1, 2, 0},
+                                         {1, 0, 0, 0}}}})
+                  .get_vector());
+    auto b = backend->create_tensor(element::f32, shape_b);
+    copy_data(b, vector<float>{42});
+    auto result = backend->create_tensor(element::f32, shape_r);
+    backend->call_with_validate(f, {result}, {a, b});
+    EXPECT_EQ((test::NDArray<float, 4>({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0
+                                          {0, 3, 2, 0, 42, 42},
+                                          {2, 0, 0, 0, 42, 42},
+                                          {0, 2, 1, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}},
+                                         {{0, 0, 0, 2, 42, 42}, // img 0 chan 1
+                                          {0, 2, 3, 0, 42, 42},
+                                          {2, 0, 1, 0, 42, 42},
+                                          {2, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}}},
+                                        {{{0, 2, 1, 1, 42, 42}, // img 1 chan 0
+                                          {0, 0, 2, 0, 42, 42},
+                                          {0, 0, 1, 2, 42, 42},
+                                          {0, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}},
+                                         {{2, 1, 0, 0, 42, 42}, // img 1 chan 1
+                                          {0, 2, 0, 0, 42, 42},
+                                          {1, 1, 2, 0, 42, 42},
+                                          {1, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}}}})
+                  .get_vector()),
+              read_vector<float>(result));
+}
 // Trivial case with no reduced axes.
 NGRAPH_TEST(${BACKEND_NAME}, product_trivial)
 {