Commit 40ff77bd authored by Chris Sullivan, committed by Robert Kimball

Update pad on nvgpu (#1759)

* Add pad with fill operator using the outward-in index pattern (sketched below).

* Remove static pad and rename build_pad_dynamic -> build_pad. Update maxpool 1d padding.

* Formatting.

* Split build_pad_dynamic into build_pad and build_pad_fill.

* Add test coverage for fixed bug in op::Pad for gpu.
parent 519b18ac
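The pad-with-fill operator mentioned above works outward-in: each thread owns one output element and walks the output coordinate back toward the input, dimension by dimension, to decide whether it lands on a real input element or on the fill value. Below is a minimal CPU reference of that mapping, a sketch only; the helper name, the full-rank padding vectors, and the float element type are illustrative assumptions, not part of this commit.

// Illustrative CPU reference of the outward-in mapping used by the new pad kernels.
// Assumes row-major layout, a pre-sized `out`, and full-rank padding vectors.
#include <cstdint>
#include <vector>

void pad_fill_reference(const std::vector<float>& in,
                        float fill,
                        std::vector<float>& out,
                        const std::vector<int64_t>& in_shape,
                        const std::vector<int64_t>& out_shape,
                        const std::vector<int64_t>& pad_below,
                        const std::vector<int64_t>& pad_interior)
{
    const int64_t rank = static_cast<int64_t>(in_shape.size());
    std::vector<int64_t> in_strides(rank, 1), out_strides(rank, 1);
    for (int64_t i = rank - 2; i >= 0; i--)
    {
        in_strides[i] = in_strides[i + 1] * in_shape[i + 1];
        out_strides[i] = out_strides[i + 1] * out_shape[i + 1];
    }
    for (size_t tid = 0; tid < out.size(); tid++)
    {
        int64_t rem = static_cast<int64_t>(tid);
        int64_t in_idx = 0;
        bool in_bounds = true;
        for (int64_t i = 0; i < rank; i++)
        {
            // position of this output coordinate relative to the padded-below edge
            int64_t dil = rem / out_strides[i] - pad_below[i];
            rem %= out_strides[i];
            // undo interior (dilation) padding: only every (pad_interior+1)-th slot is real data
            int64_t src = dil / (pad_interior[i] + 1);
            in_bounds = in_bounds && dil >= 0 && dil % (pad_interior[i] + 1) == 0 &&
                        src < in_shape[i];
            in_idx += src * in_strides[i];
        }
        out[tid] = in_bounds ? in[in_idx] : fill;
    }
}

The generated device kernels in this diff follow the same per-dimension mapping, one output element per thread, with strides and padding passed as kernel arguments.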
......@@ -306,27 +306,22 @@ size_t runtime::gpu::CUDAEmitter::build_reverse(const std::array<std::string, 2>
return this->m_primitive_emitter->register_primitive(kernel_launch, hash);
}
size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_pad(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
NVShape padding_below,
NVShape padding_above,
NVShape padding_interior,
const std::string& pad_value)
NVShape padding_interior)
{
// Need to check: are there models in which some tensors will have different types? if so, this
// hash needs to include the tensor types.
std::string val_hash = (pad_value == "") ? "0" : "1";
std::string hash = "pad_i" + join(input_shape, "_") + "_pb" + join(padding_below, "_") + "_pa" +
join(padding_above, "_") + "_pi" + join(padding_interior, "_") + "_pv" +
val_hash;
uint32_t rank = static_cast<uint32_t>(input_shape.size());
std::stringstream kernel_name;
kernel_name << "pad_" << join(dtypes, "_") << rank;
std::string hash = kernel_name.str() + "pad_i" + join(input_shape, "_") + "pad_o" +
join(output_shape) + "_pb" + join(padding_below, "_") + "_pi" +
join(padding_interior, "_");
// For backwards compatibility we currently use two unordered maps
// 1. one looks up the compiled cuda kernel (CudaFunctionPool)
// 2. the other looks to see if this kernel is already in the primitive list
// Once all previously implemented cuda kernels are refactored to use the
// CUDAEmitter/GPUPrimitiveEmitter interface, only one map (from hash to primitive index)
// will be required.
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
......@@ -335,123 +330,50 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
return primitive_index;
}
size_t nthreads = shape_size(output_shape);
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x =
align_to_block_size(static_cast<uint32_t>(nthreads), block_size_x);
NVShape pad_below(input_shape.size(), 0);
NVShape pad_interior(input_shape.size(), 1);
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(hash);
if (compiled_kernel == nullptr)
int64_t i = padding_below.size() - 1;
int64_t j = input_shape.size() - 1;
for (; i >= 0; i--, j--)
{
// normalize pad dimensions to shape dimensions
NVShape pad_below(input_shape.size(), 0);
NVShape pad_above(input_shape.size(), 0);
NVShape pad_interior(input_shape.size(), 0);
pad_below[j] = padding_below[i];
pad_interior[j] = padding_interior[i];
}
// if padding_interior is not zero length, it
// is from op::Pad for which padding_below will
// always be equal in size to padding_above
if (padding_below.size() != input_shape.size())
{
for (int64_t i = padding_below.size() - 1; i >= 0; i--)
{
pad_below[i + input_shape.size() - padding_below.size()] = padding_below[i];
pad_above[i + input_shape.size() - padding_above.size()] = padding_above[i];
}
}
else
{
pad_below = padding_below;
pad_above = padding_above;
pad_interior = padding_interior;
}
NVShape input_strides = row_major_strides(input_shape);
NVShape output_strides = row_major_strides(output_shape);
NVShape input_strides = row_major_strides(input_shape);
NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
int offset = 0;
for (size_t i = 0; i < output_strides.size(); i++)
{
offset += (output_strides[i] * pad_below[i]);
}
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes.front(), "in")
.add_placeholder(dtypes.back(), "out")
.add("input_strides", input_strides)
.add("output_strides", output_strides)
.add("padding_below", pad_below)
.add("padding_interior", pad_interior)
.add("n", nthreads);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
writer << "extern \"C\" __global__ void cuda_" << hash << "(";
// if the pad value is static, a runtime argument isn't necessary
if (pad_value == "")
{
writer << dtypes[0] << "* val, ";
}
writer << dtypes[0] << "* in, " << dtypes[1] << "* out)\n";
writer.block_begin();
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
// fill kernel
writer << "if (tid < " << nthreads << ")\n";
writer.block_begin();
{
if (pad_value == "")
{
writer << "out[tid] = *val;\n";
}
else
{
writer << "out[tid] = " << pad_value << ";\n";
}
}
writer.block_end();
// pad re-index kernel
writer << "if (tid < " << shape_size(input_shape) << ")\n";
writer.block_begin();
{
writer << "size_t idx = ";
writer << offset << " + (tid % " << input_shape.back() << ") * "
<< 1 + pad_interior.back();
int64_t last = input_strides.size() - 1;
for (int64_t i = last - 1; i >= 0; i--)
{
writer << " + (((tid / " << input_strides[i] << ") % " << input_shape[i + 1]
<< ") * " << 1 + pad_interior[i] << ") * " << output_strides[i];
}
writer << ";\n";
writer << "out[idx] = in[tid];\n";
}
writer.block_end();
}
writer.block_end();
compiled_kernel = m_ctx->compiled_kernel_pool->set(hash, writer.get_code());
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_pad_op(writer, kernel_name.str(), args, rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
std::unique_ptr<gpu::primitive> pad;
// if the pad value is statically provided, the kernel call signature is different
if (pad_value == "") // pad value provided at runtime (dynamic)
{
pad.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
void* args_list[] = {&inputs[1], &inputs[0], &outputs[0]};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1, // grid dim
block_size_x,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
debug_sync();
}});
}
else // pad value provided at compile time (static)
{
pad.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
void* args_list[] = {&inputs[0], &outputs[0]};
// create the launch primitive
std::unique_ptr<gpu::primitive> pad(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
......@@ -465,20 +387,19 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
0)); // arguments
debug_sync();
}});
}
return this->m_primitive_emitter->register_primitive(pad, hash);
}
size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string, 2>& dtypes,
NVShape input_shape,
NVShape output_shape,
NVShape padding_below,
NVShape padding_interior)
size_t runtime::gpu::CUDAEmitter::build_pad_fill(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
NVShape padding_below,
NVShape padding_interior)
{
uint32_t rank = static_cast<uint32_t>(input_shape.size());
std::stringstream kernel_name;
kernel_name << "pad_dynamic_" << join(dtypes, "_") << rank;
kernel_name << "pad_" << join(dtypes, "_") << rank;
std::string hash = kernel_name.str() + "pad_i" + join(input_shape, "_") + "pad_o" +
join(output_shape) + "_pb" + join(padding_below, "_") + "_pi" +
......@@ -494,11 +415,6 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
return primitive_index;
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
NVShape pad_below(input_shape.size(), 0);
NVShape pad_interior(input_shape.size(), 1);
......@@ -513,33 +429,37 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
NVShape input_strides = row_major_strides(input_shape);
NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "out")
args.add_placeholder(dtypes.front(), "in")
.add_placeholder(dtypes[1], "pad")
.add_placeholder(dtypes.back(), "out")
.add("input_shape", input_shape)
.add("input_strides", input_strides)
.add("output_strides", output_strides)
.add("padding_below", pad_below)
.add("padding_interior", pad_interior)
.add("n", nthreads);
// Check if the kernel has already been compiled. If so, create a launch
// primitive for it based on the input tensor shape but do not recompile
// the kernel. Otherwise, compile the kernel and then create the primitive.
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_pad_dynamic_op(writer, kernel_name.str(), args, dtypes, rank);
CudaKernelBuilder::get_pad_fill_op(writer, kernel_name.str(), args, rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
// create the launch primitive
std::unique_ptr<gpu::primitive> pad_dynamic(
std::unique_ptr<gpu::primitive> pad(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.resolve_placeholder(1, &inputs[1])
.resolve_placeholder(2, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
......@@ -556,7 +476,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
debug_sync();
}});
return this->m_primitive_emitter->register_primitive(pad_dynamic, hash);
return this->m_primitive_emitter->register_primitive(pad, hash);
}
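From the caller's side the split is: build_pad only re-indexes input elements into an already-allocated output buffer, so the caller decides what the padded region holds, while build_pad_fill also consumes the pad value as a second input tensor and writes every output element. A hedged sketch of the two call sites, with placeholder shapes and dtypes, following the signatures declared in the header diff further down:

// Hypothetical call sites; the "float" dtypes and the shape variables are placeholders.
size_t reindex_index = cuda_emitter->build_pad({"float", "float"},
                                               input_shape,
                                               padded_shape,
                                               padding_below,
                                               padding_interior);
size_t fill_index = cuda_emitter->build_pad_fill({"float", "float", "float"},
                                                 input_shape,
                                                 padded_shape,
                                                 padding_below,
                                                 padding_interior);
// Invocation: build_pad consumes {input}; build_pad_fill consumes {input, pad_value}.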
size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>& dtypes,
......@@ -1335,32 +1255,37 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
/// asymmetric padding detection
bool pad_required = false;
auto shape_to_pool =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
if (shape_to_pool != input_shape)
{
pad_required = true;
}
auto input_shape_padded = input_shape;
pad_required = pad_required && (padding_below != padding_above);
size_t padded_size;
// asymmetric padding
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max();
if (pad_required)
{
auto temp_size = shape_size(shape_to_pool) * args[0].get_element_type().size();
if (padding_below != padding_above)
{
Shape padding_interior(padding_below.size(), 1);
input_shape_padded =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
padded_size = shape_size(input_shape_padded);
// currently we set this to floating point only; need to add other datatype support later
float pad_value = std::numeric_limits<float>::lowest();
std::vector<float> temp(padded_size, pad_value);
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
idx_workspace = allocator.reserve_workspace(temp_size);
auto pad_value = TypeInfo::Get(args[0].get_element_type())->lowest();
pad_index = build_pad({{input_type, output_type}},
input_shape,
shape_to_pool,
padding_below,
padding_above,
Shape{},
pad_value);
idx_workspace = allocator.reserve_argspace(temp.data(),
padded_size * args[0].get_element_type().size());
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding during pooling
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
pad_required = true;
}
/// end asymmetric padding detection
......@@ -1377,7 +1302,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
{
// void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
// gpu::invoke_primitive(m_ctx,
// pad_dynamic_index,
// pad_index,
// std::vector<void*>{inputs[0]}.data(),
// std::vector<void*>{pad_buffer}.data());
......@@ -2403,7 +2328,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::ReplaceSlice* node,
Shape input_strides = row_major_strides(input_shape);
Shape replace_strides = row_major_strides(replace_shape);
size_t pad_index = build_pad_dynamic(
size_t pad_index = build_pad(
{{input_type, output_type}}, replace_shape, input_shape, lower_bounds, slice_strides);
if (in_place_op)
......
......@@ -50,19 +50,17 @@ namespace ngraph
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
public:
size_t build_pad(const std::array<std::string, 2>& dtypes,
size_t build_pad(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
NVShape pad_below,
NVShape pad_above,
NVShape pad_interior,
const std::string& pad_value = "");
NVShape padding_below,
NVShape padding_interior);
size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
NVShape input_shape,
NVShape output_shape,
NVShape padding_below,
NVShape padding_interior);
size_t build_pad_fill(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
NVShape padding_below,
NVShape padding_interior);
size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
NVShape input_shape,
......
......@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true;
if (pad_required || is_deconvolution)
{
......@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index =
cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
pad_index = cuda_emitter->build_pad({{args[0].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}},
input_shape,
input_shape_padded,
......@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max())
pad_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
pad_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
......@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max();
size_t slice_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true;
if (pad_required || is_deconvolution)
......@@ -556,11 +555,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
output_shape,
output_shape_padded,
padding_below,
padding_interior);
pad_index = cuda_emitter->build_pad({{input_type, output_type}},
output_shape,
output_shape_padded,
padding_below,
padding_interior);
slice_index = cuda_emitter->build_slice({{input_type, output_type}},
output_shape_padded,
......@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max() &&
pad_index != std::numeric_limits<size_t>::max() &&
slice_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
pad_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
......@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true;
if (pad_required || is_deconvolution)
{
......@@ -675,11 +674,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
input_shape_0,
input_shape_padded,
padding_below,
padding_interior);
pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape_0,
input_shape_padded,
padding_below,
padding_interior);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding
......@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max())
pad_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
pad_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
......@@ -768,11 +767,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
padded_size * args[0].get_element_type().size());
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding during pooling
......
......@@ -638,12 +638,10 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank)
void runtime::gpu::CudaKernelBuilder::get_pad_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
size_t rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
......@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_pad_fill_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
size_t rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "bool in_bounds = true;\n";
writer << "uint32_t output_pixel = tid;\n";
writer << "uint32_t input_pixel = 0;\n";
writer << "int32_t input, input_dil;\n";
for (size_t i = 0; i < rank; i++)
{
if (i != 0)
{
writer << "output_pixel %= output_strides" << i - 1 << ";\n";
}
writer << "input_dil = output_pixel / output_strides" << i << " - padding_below"
<< i << ";\n";
writer << "input = input_dil / (padding_interior" << i << " + 1);\n";
writer << "input_dil %= (padding_interior" << i << " + 1);\n";
writer << "in_bounds = in_bounds && (input >= 0) && (input < input_shape" << i
<< ") && (input_dil == 0);\n";
writer << "input_pixel += input * input_strides" << i << ";\n";
}
writer << "out[tid] = (in_bounds) ? in[input_pixel] : *pad;\n";
}
writer.block_end();
}
writer.block_end();
}
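For reference, roughly what get_pad_fill_op emits for rank = 2 with float tensors, written out by hand as a standalone kernel. The flattened per-dimension scalar parameters and the kernel name are approximations of whatever args.get_input_signature() actually produces; the body mirrors the writer calls above, with explicit casts added for signedness clarity.

// Approximate rank-2 instantiation of the pad-fill kernel generated above.
// Parameter flattening and the name are illustrative; the real signature comes from GPUKernelArgs.
#include <cstdint>

extern "C" __global__ void cuda_pad_fill_float_rank2(const float* in,
                                                     const float* pad,
                                                     float* out,
                                                     int32_t input_shape0,
                                                     int32_t input_shape1,
                                                     uint32_t input_strides0,
                                                     uint32_t input_strides1,
                                                     uint32_t output_strides0,
                                                     uint32_t output_strides1,
                                                     int32_t padding_below0,
                                                     int32_t padding_below1,
                                                     int32_t padding_interior0,
                                                     int32_t padding_interior1,
                                                     uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        bool in_bounds = true;
        uint32_t output_pixel = tid;
        uint32_t input_pixel = 0;
        int32_t input, input_dil;

        // dimension 0: peel off the outermost output coordinate and map it back to the input
        input_dil = static_cast<int32_t>(output_pixel / output_strides0) - padding_below0;
        input = input_dil / (padding_interior0 + 1);
        input_dil %= (padding_interior0 + 1);
        in_bounds = in_bounds && (input >= 0) && (input < input_shape0) && (input_dil == 0);
        input_pixel += input * input_strides0;

        // dimension 1: same mapping on the remaining (innermost) coordinate
        output_pixel %= output_strides0;
        input_dil = static_cast<int32_t>(output_pixel / output_strides1) - padding_below1;
        input = input_dil / (padding_interior1 + 1);
        input_dil %= (padding_interior1 + 1);
        in_bounds = in_bounds && (input >= 0) && (input < input_shape1) && (input_dil == 0);
        input_pixel += input * input_strides1;

        out[tid] = in_bounds ? in[input_pixel] : *pad;
    }
}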
void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
codegen::CodeWriter& writer,
const std::string& name,
......
......@@ -130,11 +130,15 @@ namespace ngraph
const std::string& math_kernel,
const std::vector<std::string>& data_types);
static void get_pad_dynamic_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank);
static void get_pad_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
size_t rank);
static void get_pad_fill_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
size_t rank);
static void get_ew_collective_op(codegen::CodeWriter& writer,
const std::string& name,
......
......@@ -827,12 +827,12 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS)
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_index = cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
input_shape,
output_shape,
padding_below,
padding_above,
padding_interior);
auto pad_index = cuda_emitter->build_pad_fill(
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
output_shape,
padding_below,
padding_interior);
writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n";
writer << "gpu::invoke_primitive(ctx, " << pad_index << ", input, output);\n";
......
......@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2)
EXPECT_EQ(expected, read_vector<float>(result));
}
// This test covers the case with multiple images and asymmetric padding;
// a bug found on the nvGPU backend is now covered by this test
NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym)
{
Shape shape_a{2, 2, 4, 4};
auto window_movement_strides = Strides{2, 2};
Shape padding_below{0, 0, 0, 0};
Shape padding_above{0, 0, 2, 2};
Shape padding_interior{0, 0, 0, 0};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{2, 2, 6, 6};
auto f = make_shared<Function>(
make_shared<op::Pad>(A, B, padding_below, padding_above, padding_interior),
op::ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a,
test::NDArray<float, 4>({{{{0, 1, 0, 2}, // img 0 chan 0
{0, 3, 2, 0},
{2, 0, 0, 0},
{0, 2, 1, 0}},
{{0, 0, 0, 2}, // img 0 chan 1
{0, 2, 3, 0},
{2, 0, 1, 0},
{2, 0, 0, 0}}},
{{{0, 2, 1, 1}, // img 1 chan 0
{0, 0, 2, 0},
{0, 0, 1, 2},
{0, 0, 0, 0}},
{{2, 1, 0, 0}, // img 1 chan 1
{0, 2, 0, 0},
{1, 1, 2, 0},
{1, 0, 0, 0}}}})
.get_vector());
auto b = backend->create_tensor(element::f32, shape_b);
copy_data(b, vector<float>{42});
auto result = backend->create_tensor(element::f32, shape_r);
backend->call_with_validate(f, {result}, {a, b});
EXPECT_EQ((test::NDArray<float, 4>({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0
{0, 3, 2, 0, 42, 42},
{2, 0, 0, 0, 42, 42},
{0, 2, 1, 0, 42, 42},
{42, 42, 42, 42, 42, 42},
{42, 42, 42, 42, 42, 42}},
{{0, 0, 0, 2, 42, 42}, // img 0 chan 1
{0, 2, 3, 0, 42, 42},
{2, 0, 1, 0, 42, 42},
{2, 0, 0, 0, 42, 42},
{42, 42, 42, 42, 42, 42},
{42, 42, 42, 42, 42, 42}}},
{{{0, 2, 1, 1, 42, 42}, // img 1 chan 0
{0, 0, 2, 0, 42, 42},
{0, 0, 1, 2, 42, 42},
{0, 0, 0, 0, 42, 42},
{42, 42, 42, 42, 42, 42},
{42, 42, 42, 42, 42, 42}},
{{2, 1, 0, 0, 42, 42}, // img 1 chan 1
{0, 2, 0, 0, 42, 42},
{1, 1, 2, 0, 42, 42},
{1, 0, 0, 0, 42, 42},
{42, 42, 42, 42, 42, 42},
{42, 42, 42, 42, 42, 42}}}})
.get_vector()),
read_vector<float>(result));
}
// Trivial case with no reduced axes.
NGRAPH_TEST(${BACKEND_NAME}, product_trivial)
{
......