Commit 40ff77bd authored by Chris Sullivan's avatar Chris Sullivan Committed by Robert Kimball

Update pad on nvpgu (#1759)

* Add pad with fill operator using the outward-in index pattern.

* Remove static pad and rename build_pad_dynamic -> build_pad. Update maxpool 1d padding.

* Formatting.

* Split build_pad_dynamic into build_pad and build_pad_fill.

* Add test coverage for fixed bug in op::Pad for gpu.
parent 519b18ac
...@@ -306,27 +306,22 @@ size_t runtime::gpu::CUDAEmitter::build_reverse(const std::array<std::string, 2> ...@@ -306,27 +306,22 @@ size_t runtime::gpu::CUDAEmitter::build_reverse(const std::array<std::string, 2>
return this->m_primitive_emitter->register_primitive(kernel_launch, hash); return this->m_primitive_emitter->register_primitive(kernel_launch, hash);
} }
size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes, size_t runtime::gpu::CUDAEmitter::build_pad(const std::vector<std::string>& dtypes,
NVShape input_shape, NVShape input_shape,
NVShape output_shape, NVShape output_shape,
NVShape padding_below, NVShape padding_below,
NVShape padding_above, NVShape padding_interior)
NVShape padding_interior,
const std::string& pad_value)
{ {
// Need to check: are there models in which some tensors will have different types? if so, this uint32_t rank = static_cast<uint32_t>(input_shape.size());
// hash needs to include the tensor types. std::stringstream kernel_name;
std::string val_hash = (pad_value == "") ? "0" : "1"; kernel_name << "pad_" << join(dtypes, "_") << rank;
std::string hash = "pad_i" + join(input_shape, "_") + "_pb" + join(padding_below, "_") + "_pa" +
join(padding_above, "_") + "_pi" + join(padding_interior, "_") + "_pv" +
val_hash;
std::string hash = kernel_name.str() + "pad_i" + join(input_shape, "_") + "pad_o" +
join(output_shape) + "_pb" + join(padding_below, "_") + "_pi" +
join(padding_interior, "_");
// For backwards compatability we currently use two unordered maps // For backwards compatability we currently use two unordered maps
// 1. one looks up the compiled cuda kernel (CudaFunctionPool) // 1. one looks up the compiled cuda kernel (CudaFunctionPool)
// 2. the other looks to see if this kernel is already in the primitive list // 2. the other looks to see if this kernel is already in the primitive list
// Once all previously implemented cuda kernels are refactored to use the
// CUDAEmitter/GPUPrimittiveEmitter interface, only one map (from hash to primitive index)
// will be required.
// check if the requested kernel is already an inserted primitive // check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash); size_t primitive_index = m_primitive_emitter->lookup(hash);
...@@ -335,123 +330,50 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt ...@@ -335,123 +330,50 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
return primitive_index; return primitive_index;
} }
size_t nthreads = shape_size(output_shape);
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x =
align_to_block_size(static_cast<uint32_t>(nthreads), block_size_x);
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(hash);
if (compiled_kernel == nullptr)
{
// normalize pad dimensions to shape dimensions
NVShape pad_below(input_shape.size(), 0); NVShape pad_below(input_shape.size(), 0);
NVShape pad_above(input_shape.size(), 0); NVShape pad_interior(input_shape.size(), 1);
NVShape pad_interior(input_shape.size(), 0);
// if padding_interior is not zero length, it int64_t i = padding_below.size() - 1;
// is from op::Pad for which padding_below will int64_t j = input_shape.size() - 1;
// always be equal in size to padding_above for (; i >= 0; i--, j--)
if (padding_below.size() != input_shape.size())
{
for (int64_t i = padding_below.size() - 1; i >= 0; i--)
{
pad_below[i + input_shape.size() - padding_below.size()] = padding_below[i];
pad_above[i + input_shape.size() - padding_above.size()] = padding_above[i];
}
}
else
{ {
pad_below = padding_below; pad_below[j] = padding_below[i];
pad_above = padding_above; pad_interior[j] = padding_interior[i];
pad_interior = padding_interior;
} }
NVShape input_strides = row_major_strides(input_shape); NVShape input_strides = row_major_strides(input_shape);
NVShape output_strides = row_major_strides(output_shape); NVShape output_strides = row_major_strides(output_shape);
int offset = 0; uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
for (size_t i = 0; i < output_strides.size(); i++) // TODO: currently we set it to 64, will add tuning method later
{ uint32_t block_size_x = 64;
offset += (output_strides[i] * pad_below[i]); uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
}
codegen::CodeWriter writer;
writer << "extern \"C\" __global__ void cuda_" << hash << "(";
// if the pad value is static, a runtime argument isn't necessary
if (pad_value == "")
{
writer << dtypes[0] << "* val, ";
}
writer << dtypes[0] << "* in, " << dtypes[1] << "* out)\n";
writer.block_begin();
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
// fill kernel auto args = m_primitive_emitter->add_kernel_args();
writer << "if (tid < " << nthreads << ")\n"; args.add_placeholder(dtypes.front(), "in")
writer.block_begin(); .add_placeholder(dtypes.back(), "out")
{ .add("input_strides", input_strides)
if (pad_value == "") .add("output_strides", output_strides)
{ .add("padding_below", pad_below)
writer << "out[tid] = *val;\n"; .add("padding_interior", pad_interior)
} .add("n", nthreads);
else
{
writer << "out[tid] = " << pad_value << ";\n";
}
}
writer.block_end();
// pad re-index kernel auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
writer << "if (tid < " << shape_size(input_shape) << ")\n"; if (compiled_kernel == nullptr)
writer.block_begin();
{
writer << "size_t idx = ";
writer << offset << " + (tid % " << input_shape.back() << ") * "
<< 1 + pad_interior.back();
int64_t last = input_strides.size() - 1;
for (int64_t i = last - 1; i >= 0; i--)
{ {
writer << " + (((tid / " << input_strides[i] << ") % " << input_shape[i + 1] codegen::CodeWriter writer;
<< ") * " << 1 + pad_interior[i] << ") * " << output_strides[i]; CudaKernelBuilder::add_pod_typedefs(writer);
} CudaKernelBuilder::get_pad_op(writer, kernel_name.str(), args, rank);
writer << ";\n"; compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
writer << "out[idx] = in[tid];\n";
}
writer.block_end();
} }
writer.block_end();
compiled_kernel = m_ctx->compiled_kernel_pool->set(hash, writer.get_code()); // create the launch primitive
} std::unique_ptr<gpu::primitive> pad(
std::unique_ptr<gpu::primitive> pad; new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.get_argument_list();
// if the pad value is statically provided, the kernel call signature is different
if (pad_value == "") // pad value provided at runtime (dynamic)
{
pad.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
void* args_list[] = {&inputs[1], &inputs[0], &outputs[0]};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1, // grid dim
block_size_x,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
debug_sync();
}});
}
else // pad value provided at compile time (static)
{
pad.reset(new gpu::primitive{[=](void** inputs, void** outputs) {
void* args_list[] = {&inputs[0], &outputs[0]};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(), CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x, aligned_grid_size_x,
1, 1,
...@@ -465,12 +387,11 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt ...@@ -465,12 +387,11 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
0)); // arguments 0)); // arguments
debug_sync(); debug_sync();
}}); }});
}
return this->m_primitive_emitter->register_primitive(pad, hash); return this->m_primitive_emitter->register_primitive(pad, hash);
} }
size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string, 2>& dtypes, size_t runtime::gpu::CUDAEmitter::build_pad_fill(const std::vector<std::string>& dtypes,
NVShape input_shape, NVShape input_shape,
NVShape output_shape, NVShape output_shape,
NVShape padding_below, NVShape padding_below,
...@@ -478,7 +399,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string ...@@ -478,7 +399,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
{ {
uint32_t rank = static_cast<uint32_t>(input_shape.size()); uint32_t rank = static_cast<uint32_t>(input_shape.size());
std::stringstream kernel_name; std::stringstream kernel_name;
kernel_name << "pad_dynamic_" << join(dtypes, "_") << rank; kernel_name << "pad_" << join(dtypes, "_") << rank;
std::string hash = kernel_name.str() + "pad_i" + join(input_shape, "_") + "pad_o" + std::string hash = kernel_name.str() + "pad_i" + join(input_shape, "_") + "pad_o" +
join(output_shape) + "_pb" + join(padding_below, "_") + "_pi" + join(output_shape) + "_pb" + join(padding_below, "_") + "_pi" +
...@@ -494,11 +415,6 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string ...@@ -494,11 +415,6 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
return primitive_index; return primitive_index;
} }
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
NVShape pad_below(input_shape.size(), 0); NVShape pad_below(input_shape.size(), 0);
NVShape pad_interior(input_shape.size(), 1); NVShape pad_interior(input_shape.size(), 1);
...@@ -513,33 +429,37 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string ...@@ -513,33 +429,37 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
NVShape input_strides = row_major_strides(input_shape); NVShape input_strides = row_major_strides(input_shape);
NVShape output_strides = row_major_strides(output_shape); NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
auto args = m_primitive_emitter->add_kernel_args(); auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in") args.add_placeholder(dtypes.front(), "in")
.add_placeholder(dtypes[1], "out") .add_placeholder(dtypes[1], "pad")
.add_placeholder(dtypes.back(), "out")
.add("input_shape", input_shape)
.add("input_strides", input_strides) .add("input_strides", input_strides)
.add("output_strides", output_strides) .add("output_strides", output_strides)
.add("padding_below", pad_below) .add("padding_below", pad_below)
.add("padding_interior", pad_interior) .add("padding_interior", pad_interior)
.add("n", nthreads); .add("n", nthreads);
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str()); auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr) if (compiled_kernel == nullptr)
{ {
codegen::CodeWriter writer; codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer); CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_pad_dynamic_op(writer, kernel_name.str(), args, dtypes, rank); CudaKernelBuilder::get_pad_fill_op(writer, kernel_name.str(), args, rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code()); compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
} }
// create the launch primitive // create the launch primitive
std::unique_ptr<gpu::primitive> pad_dynamic( std::unique_ptr<gpu::primitive> pad(
new gpu::primitive{[=](void** inputs, void** outputs) mutable { new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0]) void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0]) .resolve_placeholder(1, &inputs[1])
.resolve_placeholder(2, &outputs[0])
.get_argument_list(); .get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(), CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
...@@ -556,7 +476,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string ...@@ -556,7 +476,7 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
debug_sync(); debug_sync();
}}); }});
return this->m_primitive_emitter->register_primitive(pad_dynamic, hash); return this->m_primitive_emitter->register_primitive(pad, hash);
} }
size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>& dtypes, size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>& dtypes,
...@@ -1335,32 +1255,37 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node) ...@@ -1335,32 +1255,37 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
/// assymetric padding detection /// assymetric padding detection
bool pad_required = false; bool pad_required = false;
auto shape_to_pool = auto input_shape_padded = input_shape;
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
if (shape_to_pool != input_shape)
{
pad_required = true;
}
pad_required = pad_required && (padding_below != padding_above); size_t padded_size;
// asymetric padding // asymetric padding
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
if (pad_required) if (padding_below != padding_above)
{ {
auto temp_size = shape_size(shape_to_pool) * args[0].get_element_type().size(); Shape padding_interior(padding_below.size(), 1);
input_shape_padded =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
padded_size = shape_size(input_shape_padded);
//currntly we set this to float point only, need to add other datatype support later
float pad_value = std::numeric_limits<float>::lowest();
std::vector<float> temp(padded_size, pad_value);
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator(); GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
idx_workspace = allocator.reserve_workspace(temp_size); idx_workspace = allocator.reserve_argspace(temp.data(),
padded_size * args[0].get_element_type().size());
auto pad_value = TypeInfo::Get(args[0].get_element_type())->lowest(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_index = cuda_emitter->build_pad({{input_type, output_type}},
pad_index = build_pad({{input_type, output_type}},
input_shape, input_shape,
shape_to_pool, input_shape_padded,
padding_below, padding_below,
padding_above, padding_interior);
Shape{},
pad_value); // asymetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding during pooling
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
pad_required = true;
} }
/// end asymmetric padding detection /// end asymmetric padding detection
...@@ -1377,7 +1302,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node) ...@@ -1377,7 +1302,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
{ {
// void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace); // void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
// gpu::invoke_primitive(m_ctx, // gpu::invoke_primitive(m_ctx,
// pad_dynamic_index, // pad_index,
// std::vector<void*>{inputs[0]}.data(), // std::vector<void*>{inputs[0]}.data(),
// std::vector<void*>{pad_buffer}.data()); // std::vector<void*>{pad_buffer}.data());
...@@ -2403,7 +2328,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::ReplaceSlice* node, ...@@ -2403,7 +2328,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::ReplaceSlice* node,
Shape input_strides = row_major_strides(input_shape); Shape input_strides = row_major_strides(input_shape);
Shape replace_strides = row_major_strides(replace_shape); Shape replace_strides = row_major_strides(replace_shape);
size_t pad_index = build_pad_dynamic( size_t pad_index = build_pad(
{{input_type, output_type}}, replace_shape, input_shape, lower_bounds, slice_strides); {{input_type, output_type}}, replace_shape, input_shape, lower_bounds, slice_strides);
if (in_place_op) if (in_place_op)
......
...@@ -50,15 +50,13 @@ namespace ngraph ...@@ -50,15 +50,13 @@ namespace ngraph
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op); size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
public: public:
size_t build_pad(const std::array<std::string, 2>& dtypes, size_t build_pad(const std::vector<std::string>& dtypes,
NVShape input_shape, NVShape input_shape,
NVShape output_shape, NVShape output_shape,
NVShape pad_below, NVShape padding_below,
NVShape pad_above, NVShape padding_interior);
NVShape pad_interior,
const std::string& pad_value = "");
size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes, size_t build_pad_fill(const std::vector<std::string>& dtypes,
NVShape input_shape, NVShape input_shape,
NVShape output_shape, NVShape output_shape,
NVShape padding_below, NVShape padding_below,
......
...@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
Shape padding_interior(data_dilation_strides); Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; bool can_find_algo = true;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
{ {
...@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
idx_workspace = allocator.reserve_workspace(temp_size, true); idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = pad_index = cuda_emitter->build_pad({{args[0].get_element_type().c_type_string(),
cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}}, out[0].get_element_type().c_type_string()}},
input_shape, input_shape,
input_shape_padded, input_shape_padded,
...@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node) ...@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
std::unique_ptr<gpu::primitive> kernel_launch( std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable { new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() && if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max()) pad_index != std::numeric_limits<size_t>::max())
{ {
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace); void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx, gpu::invoke_primitive(m_ctx,
pad_dynamic_index, pad_index,
std::vector<void*>{inputs[0]}.data(), std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data()); std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive( gpu::invoke_primitive(
...@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
Shape padding_interior(data_dilation_strides); Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
size_t slice_index = std::numeric_limits<size_t>::max(); size_t slice_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; bool can_find_algo = true;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
...@@ -556,7 +555,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -556,7 +555,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
idx_workspace = allocator.reserve_workspace(temp_size, true); idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}}, pad_index = cuda_emitter->build_pad({{input_type, output_type}},
output_shape, output_shape,
output_shape_padded, output_shape_padded,
padding_below, padding_below,
...@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs, std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable { void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() && if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max() && pad_index != std::numeric_limits<size_t>::max() &&
slice_index != std::numeric_limits<size_t>::max()) slice_index != std::numeric_limits<size_t>::max())
{ {
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace); void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx, gpu::invoke_primitive(m_ctx,
pad_dynamic_index, pad_index,
std::vector<void*>{inputs[0]}.data(), std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data()); std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data()); gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
...@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
Shape padding_interior(data_dilation_strides); Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max(); size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max(); size_t pad_index = std::numeric_limits<size_t>::max();
bool can_find_algo = true; bool can_find_algo = true;
if (pad_required || is_deconvolution) if (pad_required || is_deconvolution)
{ {
...@@ -675,7 +674,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -675,7 +674,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
idx_workspace = allocator.reserve_workspace(temp_size, true); idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}}, pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape_0, input_shape_0,
input_shape_padded, input_shape_padded,
padding_below, padding_below,
...@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop ...@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
std::unique_ptr<gpu::primitive> kernel_launch( std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable { new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() && if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max()) pad_index != std::numeric_limits<size_t>::max())
{ {
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace); void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx, gpu::invoke_primitive(m_ctx,
pad_dynamic_index, pad_index,
std::vector<void*>{inputs[0]}.data(), std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data()); std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive( gpu::invoke_primitive(
...@@ -768,7 +767,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node) ...@@ -768,7 +767,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
padded_size * args[0].get_element_type().size()); padded_size * args[0].get_element_type().size());
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter(); auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}}, pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape, input_shape,
input_shape_padded, input_shape_padded,
padding_below, padding_below,
......
...@@ -638,11 +638,9 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer, ...@@ -638,11 +638,9 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
writer.block_end(); writer.block_end();
} }
void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( void runtime::gpu::CudaKernelBuilder::get_pad_op(codegen::CodeWriter& writer,
codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
GPUKernelArgs& args, GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank) size_t rank)
{ {
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature(); writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
...@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( ...@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
writer.block_end(); writer.block_end();
} }
void runtime::gpu::CudaKernelBuilder::get_pad_fill_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
size_t rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "bool in_bounds = true;\n";
writer << "uint32_t output_pixel = tid;\n";
writer << "uint32_t input_pixel = 0;\n";
writer << "int32_t input, input_dil;\n";
for (size_t i = 0; i < rank; i++)
{
if (i != 0)
{
writer << "output_pixel %= output_strides" << i - 1 << ";\n";
}
writer << "input_dil = output_pixel / output_strides" << i << " - padding_below"
<< i << ";\n";
writer << "input = input_dil / (padding_interior" << i << " + 1);\n";
writer << "input_dil %= (padding_interior" << i << " + 1);\n";
writer << "in_bounds = in_bounds && (input >= 0) && (input < input_shape" << i
<< ") && (input_dil == 0);\n";
writer << "input_pixel += input * input_strides" << i << ";\n";
}
writer << "out[tid] = (in_bounds) ? in[input_pixel] : *pad;\n";
}
writer.block_end();
}
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op( void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
......
...@@ -130,10 +130,14 @@ namespace ngraph ...@@ -130,10 +130,14 @@ namespace ngraph
const std::string& math_kernel, const std::string& math_kernel,
const std::vector<std::string>& data_types); const std::vector<std::string>& data_types);
static void get_pad_dynamic_op(codegen::CodeWriter& writer, static void get_pad_op(codegen::CodeWriter& writer,
const std::string& name,
GPUKernelArgs& args,
size_t rank);
static void get_pad_fill_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
GPUKernelArgs& args, GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank); size_t rank);
static void get_ew_collective_op(codegen::CodeWriter& writer, static void get_ew_collective_op(codegen::CodeWriter& writer,
......
...@@ -827,11 +827,11 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS) ...@@ -827,11 +827,11 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS)
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter(); auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_index = cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}}, auto pad_index = cuda_emitter->build_pad_fill(
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape, input_shape,
output_shape, output_shape,
padding_below, padding_below,
padding_above,
padding_interior); padding_interior);
writer << "void* input[] = {" << node_names(args) << "};\n"; writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n"; writer << "void* output[] = {" << node_names(out) << "};\n";
......
...@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2) ...@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2)
EXPECT_EQ(expected, read_vector<float>(result)); EXPECT_EQ(expected, read_vector<float>(result));
} }
// This test covers the case with multiple image and with asymetric pad
// bug has been found on nvGPU side now covered by this test
NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym)
{
Shape shape_a{2, 2, 4, 4};
auto window_movement_strides = Strides{2, 2};
Shape padding_below{0, 0, 0, 0};
Shape padding_above{0, 0, 2, 2};
Shape padding_interior{0, 0, 0, 0};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{2, 2, 6, 6};
auto f = make_shared<Function>(
make_shared<op::Pad>(A, B, padding_below, padding_above, padding_interior),
op::ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a,
test::NDArray<float, 4>({{{{0, 1, 0, 2}, // img 0 chan 0
{0, 3, 2, 0},
{2, 0, 0, 0},
{0, 2, 1, 0}},
{{0, 0, 0, 2}, // img 0 chan 1
{0, 2, 3, 0},
{2, 0, 1, 0},
{2, 0, 0, 0}}},
{{{0, 2, 1, 1}, // img 1 chan 0
{0, 0, 2, 0},
{0, 0, 1, 2},
{0, 0, 0, 0}},
{{2, 1, 0, 0}, // img 1 chan 1
{0, 2, 0, 0},
{1, 1, 2, 0},
{1, 0, 0, 0}}}})
.get_vector());
auto b = backend->create_tensor(element::f32, shape_b);
copy_data(b, vector<float>{42});
auto result = backend->create_tensor(element::f32, shape_r);
backend->call_with_validate(f, {result}, {a, b});
EXPECT_EQ((test::NDArray<float, 4>({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0
{0, 3, 2, 0, 42, 42},
{2, 0, 0, 0, 42, 42},
{0, 2, 1, 0, 42, 42},
{42, 42, 42, 42, 42, 42},
{42, 42, 42, 42, 42, 42}},
{{0, 0, 0, 2, 42, 42}, // img 1 chan 0
{0, 2, 3, 0, 42, 42},
{2, 0, 1, 0, 42, 42},
{2, 0, 0, 0, 42, 42},
{42, 42, 42, 42, 42, 42},
{42, 42, 42, 42, 42, 42}}},
{{{0, 2, 1, 1, 42, 42}, // img 1 chan 0
{0, 0, 2, 0, 42, 42},
{0, 0, 1, 2, 42, 42},
{0, 0, 0, 0, 42, 42},
{42, 42, 42, 42, 42, 42},
{42, 42, 42, 42, 42, 42}},
{{2, 1, 0, 0, 42, 42}, // img 1 chan 1
{0, 2, 0, 0, 42, 42},
{1, 1, 2, 0, 42, 42},
{1, 0, 0, 0, 42, 42},
{42, 42, 42, 42, 42, 42},
{42, 42, 42, 42, 42, 42}}}})
.get_vector()),
read_vector<float>(result));
}
// Trivial case with no reduced axes. // Trivial case with no reduced axes.
NGRAPH_TEST(${BACKEND_NAME}, product_trivial) NGRAPH_TEST(${BACKEND_NAME}, product_trivial)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment