Commit bb94fa85 authored by Chris Sullivan, committed by Scott Cyphers

softmax & convolution memory primitive caching (#1290)

* Updated softmax.

* Formatting.

* Updated convolution.

* Use build_primitive overloading. Add helper to emit type_string given a node.

* Formatting.

* Update ConvolutionBackpropData.

* convolution backprop & max pool memory primitive caching (#1303)

* Updated ConvolutionBackpropFilters.
* Update MaxPool.

* Update Max and Min. (#1307)
parent eba9439b
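
The pattern applied throughout this change is hash-keyed primitive caching: each build_primitive overload derives a string key from the node's types and shapes, returns an already-built primitive index if that key is cached, and otherwise builds the primitive, inserts it, and caches the index. A minimal sketch of the pattern, mirroring the member calls used in the diff below (op::SomeOp and the kernel_launch placeholder are illustrative, not part of the change):

size_t build_primitive(const op::SomeOp* node) // hypothetical op type, for illustration only
{
    // key the primitive on everything that affects code generation
    std::stringstream ss;
    ss << "some_op_" << runtime::gpu::kernel::emit_type_string(node);
    std::string hash = ss.str();
    // reuse an existing primitive if one was already built for this key
    size_t primitive_index = m_primitive_emitter->lookup(hash);
    if (primitive_index != std::numeric_limits<size_t>::max())
    {
        return primitive_index;
    }
    std::unique_ptr<gpu::primitive> kernel_launch; // populated with the op's kernel(s)
    // ... build kernel_launch for this node (see the functions below) ...
    primitive_index = m_primitive_emitter->insert(std::move(kernel_launch));
    m_primitive_emitter->cache(hash, primitive_index);
    return primitive_index;
}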
......@@ -22,10 +22,13 @@
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/cuda_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/runtime/gpu/type_info.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
......@@ -1153,6 +1156,160 @@ size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const std::vector<std
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
auto padding_below = node->get_padding_below();
auto padding_above = node->get_padding_above();
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "max_pool_" << runtime::gpu::kernel::emit_type_string(node) << "_i"
<< join(input_shape, "_") << "_o" << join(result_shape, "_") << "_ws"
<< join(node->get_window_shape(), "_") << "_wst"
<< join(node->get_window_movement_strides(), "_") << "_pb" << join(padding_below, "_")
<< "_pb" << join(padding_above, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
/// asymmetric padding detection
bool pad_required = false;
auto shape_to_pool =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
if (shape_to_pool != input_shape)
{
pad_required = true;
}
pad_required = pad_required && (padding_below != padding_above);
// asymmetric padding
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max();
if (pad_required)
{
auto temp_size = shape_size(shape_to_pool) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
idx_workspace = allocator.reserve_workspace(temp_size);
auto pad_value = TypeInfo::Get(args[0].get_element_type())->lowest();
pad_index = build_pad({{input_type, output_type}},
input_shape,
shape_to_pool,
padding_below,
padding_above,
Shape{},
pad_value);
}
/// end asymmetric padding detection
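// the CUDA max pool kernel is 1d: only the innermost window extent and stride are used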
size_t max_pool_index = build_1d_max_pool({{input_type, output_type}},
input_shape,
result_shape,
node->get_window_shape().back(),
node->get_window_movement_strides().back());
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, max_pool_index, std::vector<void*>{pad_buffer}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, max_pool_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto tensor_shape = args[0].get_shape();
auto axes = node->get_axes();
std::stringstream ss;
ss << "softmax_" << runtime::gpu::kernel::emit_type_string(node) << "_s"
<< join(tensor_shape, "_") << "_ra" << join(axes, "_");
auto hash = ss.str();
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// build composite primitive
// reserve a temporary buffer for the intermediate reduction
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
auto reduced_shape = tensor_shape;
for (auto const& axis : axes)
{
reduced_shape[axis] = 1;
}
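// the workspace holds one partial sum (the softmax denominator) per reduced output element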
size_t reduced_size = shape_size(reduced_shape);
size_t workspace_idx =
allocator.reserve_workspace(reduced_size * out[0].get_element_type().size());
// exponentiate with fused sum reduction to calculate softmax denominator
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
size_t exp_sum_reduce = build_elementwise_collective<ngraph::op::Exp, ngraph::op::Add>(
{{input_type, output_type}}, tensor_shape, {}, axes, true /* multi-output */);
// inplace binary division with fused broadcast to calculate softmax
size_t div_broadcast = build_elementwise_collective<ngraph::op::Divide>(
std::vector<std::string>(3, output_type), tensor_shape, {1}, axes);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* workspace = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
// cache the elementwise result and the fused result (multi-output)
runtime::gpu::invoke_primitive(
m_ctx, exp_sum_reduce, inputs, std::vector<void*>{workspace, outputs[0]}.data());
runtime::gpu::invoke_primitive(
m_ctx, div_broadcast, std::vector<void*>{outputs[0], workspace}.data(), outputs);
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t
runtime::gpu::CUDAEmitter::build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
GPUShape tensor_shape,
......@@ -1653,6 +1810,133 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string,
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
{
std::stringstream ss;
ss << "convolution_fprop_" << runtime::gpu::kernel::emit_type_string(node);
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
auto tensor_size = input_shape.size();
// primitive cache parameters
ss << "_s" << join(input_shape, "_") << "_pb" << join(node->get_padding_below(), "_") << "_pi"
<< join(node->get_data_dilation_strides(), "_") << "_fs" << join(filter_shape, "_") << "_fst"
<< join(node->get_window_movement_strides(), "_") << "_fdi"
<< join(node->get_window_dilation_strides(), "_");
auto hash = ss.str();
// check if the requested primitive is already built
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// Reshape from NC{d1,..,dn} -> C{d1,...,dn}N
// and from KC{df1,...,dfn} -> C{df1,...,dfn}N.
// TODO: This should be done via a pass similar to
// what is done for convolution in the IA transformer
// c.f runtime/cpu/pass/cpu_layout.cpp
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
size_t transposed_data_idx =
allocator.reserve_workspace(shape_size(input_shape) * args[0].get_element_type().size());
size_t transposed_filter_idx =
allocator.reserve_workspace(shape_size(filter_shape) * args[1].get_element_type().size());
size_t transposed_output_idx =
allocator.reserve_workspace(shape_size(output_shape) * out[0].get_element_type().size());
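// input_order is the cyclic permutation {1, 2, ..., n-1, 0}: it moves the batch axis to the end (NC{d} -> C{d}N)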
GPUShape input_order;
for (int i = 1; i <= tensor_size; i++)
{
input_order.push_back(i % tensor_size);
}
size_t reshape_data_index = build_reshape(
{{args[0].get_element_type().c_type_string(), args[0].get_element_type().c_type_string()}},
input_shape,
input_order);
size_t reshape_filter_index = build_reshape(
{{args[1].get_element_type().c_type_string(), args[1].get_element_type().c_type_string()}},
filter_shape,
input_order);
// local helper to reshape tensor shape objects
auto reshape = [](const Shape& shape, const GPUShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
output[i] = shape[order[i]];
}
return output;
};
// reorder axes of the input shape (NC{d_1,...,d_n} -> C{d_1,...,d_n}N)
input_shape = reshape(input_shape, input_order);
// reorder axes of the filter shape (KC{df_1,...,df_n} -> C{df_1,...,df_n}K)
filter_shape = reshape(filter_shape, input_order);
// reorder axes of the output shape (NK{do_1,...,do_n} -> K{do_1,...,do_n}N)
output_shape = reshape(output_shape, input_order);
size_t conv_index = build_convolution({{args[0].get_element_type().c_type_string(),
args[1].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}},
input_shape,
node->get_padding_below(),
node->get_data_dilation_strides(),
filter_shape,
node->get_window_movement_strides(),
node->get_window_dilation_strides(),
output_shape);
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
input_order.push_back(static_cast<int>(tensor_size - 1));
for (int i = 0; i < tensor_size - 1; i++)
{
input_order.push_back(i);
}
size_t reshape_output_index = build_reshape(
{{args[1].get_element_type().c_type_string(), args[1].get_element_type().c_type_string()}},
output_shape,
input_order);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* data = gpu::invoke_memory_primitive(m_ctx, transposed_data_idx);
void* filter = gpu::invoke_memory_primitive(m_ctx, transposed_filter_idx);
void* output = gpu::invoke_memory_primitive(m_ctx, transposed_output_idx);
gpu::invoke_primitive(m_ctx,
reshape_data_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{data}.data());
gpu::invoke_primitive(m_ctx,
reshape_filter_index,
std::vector<void*>{inputs[1]}.data(),
std::vector<void*>{filter}.data());
gpu::invoke_primitive(m_ctx,
conv_index,
std::vector<void*>{data, filter}.data(),
std::vector<void*>{output}.data());
gpu::invoke_primitive(m_ctx,
reshape_output_index,
std::vector<void*>{output}.data(),
std::vector<void*>{outputs[0]}.data());
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
......@@ -1675,7 +1959,7 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
// coalescing and vectorization is maintained regardless of coordinate access
// (e.g. data and filter dilation).
std::string kernel_name = "convolution_fprop_c_nd_n" + join(dtypes, "_");
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
// prerequisites for kernel caching and building
......@@ -1693,20 +1977,6 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
kernel_name = kernel_name + "_n" + std::to_string(N) + "_k" + std::to_string(K) + "_fsz" +
std::to_string(filter_size) + "_r" + std::to_string(rank);
// primitive cache parameters
std::stringstream ss;
ss << kernel_name << "_s" << join(input_shape, "_") << "_pb" << join(input_pad_below, "_")
<< "_pi" << join(input_dilation, "_") << "_fs" << join(filter_shape, "_") << "_fst"
<< join(filter_stride, "_") << "_fdi" << join(filter_dilation, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// tiling options are determined by
// batch size (N) and number of filters (K)
int reg_tile_size = 1;
......@@ -1904,9 +2174,7 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
return this->m_primitive_emitter->insert(std::move(conv));
}
void runtime::gpu::CUDAEmitter::print_tensor_from_gpu(codegen::CodeWriter& writer,
......
......@@ -22,6 +22,10 @@
#include "ngraph/runtime/gpu/gpu_shape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/softmax.hpp"
namespace ngraph
{
class GPUShape;
......@@ -37,6 +41,11 @@ namespace ngraph
{
friend class GPUPrimitiveEmitter;
public:
size_t build_primitive(const op::Softmax* node);
size_t build_primitive(const op::Convolution* node);
size_t build_primitive(const op::MaxPool* node);
public:
size_t build_pad(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
......
......@@ -20,10 +20,12 @@
#include "ngraph/log.hpp"
#include "ngraph/runtime/gpu/cudnn_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_invoke.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/runtime/gpu/type_info.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
......@@ -327,22 +329,538 @@ cudnnConvolutionDescriptor_t& runtime::gpu::CUDNNEmitter::get_cudnn_convolution_
return conv_descriptor;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
Strides window_dilation_strides = node->get_window_dilation_strides();
Strides window_movement_strides = node->get_window_movement_strides();
Strides data_dilation_strides = node->get_data_dilation_strides();
CoordinateDiff padding_below_diff = node->get_padding_below();
CoordinateDiff padding_above_diff = node->get_padding_above();
auto dtype = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_op_" << dtype << "_i" << join(input_shape, "_") << "_w"
<< join(filter_shape, "_") << "_o" << join(output_shape, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below_diff, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
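// cuDNN convolution descriptors only express symmetric padding, so asymmetric padding must be applied explicitly beforehand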
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
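// data dilation (deconvolution) is realized here by interior padding of the input before calling cuDNN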
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
if (pad_required || is_deconvolution)
{
input_shape_padded = runtime::gpu::get_padded_shape(
input_shape, padding_below, padding_above, padding_interior);
auto temp_size = shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
// reserve zero initialized workspace
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index =
cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
size_t conv_index = build_convolution(dtype,
input_shape_padded,
filter_shape,
output_shape,
window_movement_strides,
window_dilation_strides,
padding_below);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, conv_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackpropData* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
Strides window_dilation_strides = node->get_window_dilation_strides_forward();
Strides window_movement_strides = node->get_window_movement_strides_forward();
Strides data_dilation_strides = node->get_data_dilation_strides_forward();
CoordinateDiff padding_below_diff = node->get_padding_below_forward();
CoordinateDiff padding_above_diff = node->get_padding_above_forward();
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_data_op_" << output_type << "_i" << join(input_shape, "_") << "_w"
<< join(filter_shape, "_") << "_o" << join(output_shape, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below_diff, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto output_shape_padded = output_shape;
Shape padding_below_back(output_shape.size(), 0);
Shape padding_interior_back(output_shape.size(), 1);
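// offsets and strides used below to slice the true output back out of the padded (and dilated) result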
size_t i = padding_below_back.size() - padding_below.size();
size_t j = 0;
for (; i < padding_below_back.size(); i++)
{
padding_below_back[i] = padding_below[j];
padding_interior_back[i] = data_dilation_strides[j];
j++;
}
Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
size_t slice_index = std::numeric_limits<size_t>::max();
if (pad_required || is_deconvolution)
{
output_shape_padded =
get_padded_shape(output_shape, padding_below, padding_above, padding_interior);
auto temp_size = shape_size(output_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
// reserve zero initialized workspace
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
output_shape,
output_shape_padded,
padding_below,
padding_interior);
slice_index = cuda_emitter->build_slice({{input_type, output_type}},
output_shape_padded,
padding_below_back,
padding_interior_back,
output_shape);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
size_t conv_index = build_convolution_backward_data(output_type,
args[0].get_shape(),
args[1].get_shape(),
output_shape_padded,
window_movement_strides,
window_dilation_strides,
padding_below);
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max() &&
slice_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, slice_index, std::vector<void*>{pad_buffer}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, conv_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackpropFilters* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto input_shape_0 = args[0].get_shape();
auto input_shape_1 = args[1].get_shape();
auto filter_shape = out[0].get_shape();
Strides window_dilation_strides = node->get_window_dilation_strides_forward();
Strides window_movement_strides = node->get_window_movement_strides_forward();
Strides data_dilation_strides = node->get_data_dilation_strides_forward();
CoordinateDiff padding_below_diff = node->get_padding_below_forward();
CoordinateDiff padding_above_diff = node->get_padding_above_forward();
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_filter_op_" << output_type << "_i" << join(input_shape_0, "_") << "_w"
<< join(filter_shape, "_") << "_o" << join(input_shape_1, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below_diff, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto input_shape_padded = input_shape_0;
Shape padding_interior(data_dilation_strides);
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
if (pad_required || is_deconvolution)
{
input_shape_padded = runtime::gpu::get_padded_shape(
input_shape_0, padding_below, padding_above, padding_interior);
auto temp_size = shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
// reserve zero initialized workspace
idx_workspace = allocator.reserve_workspace(temp_size, true);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
input_shape_0,
input_shape_padded,
padding_below,
padding_interior);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
size_t conv_index = build_convolution_backward_filter(output_type,
input_shape_padded,
input_shape_1,
filter_shape,
window_movement_strides,
window_dilation_strides,
padding_below);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_dynamic_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_dynamic_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, conv_index, std::vector<void*>{pad_buffer, inputs[1]}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, conv_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
auto padding_below = node->get_padding_below();
auto padding_above = node->get_padding_above();
auto input_type = args[0].get_element_type().c_type_string();
auto output_type = out[0].get_element_type().c_type_string();
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "max_pool_" << output_type << "_i" << join(input_shape, "_") << "_o"
<< join(result_shape, "_") << "_ws" << join(node->get_window_shape(), "_") << "_wst"
<< join(node->get_window_movement_strides(), "_") << "_pb" << join(padding_below, "_")
<< "_pb" << join(padding_above, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
/// asymmetric padding detection
bool pad_required = false;
auto shape_to_pool =
runtime::gpu::get_padded_shape(input_shape, padding_below, padding_above, {});
if (shape_to_pool != input_shape)
{
pad_required = true;
}
pad_required = pad_required && (padding_below != padding_above);
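// symmetric padding is handled by the cuDNN pooling descriptor itself; only asymmetric padding needs an explicit pad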
// asymmetric padding
size_t idx_workspace = std::numeric_limits<size_t>::max();
size_t pad_index = std::numeric_limits<size_t>::max();
if (pad_required)
{
auto temp_size = shape_size(shape_to_pool) * args[0].get_element_type().size();
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
idx_workspace = allocator.reserve_workspace(temp_size);
auto pad_value = TypeInfo::Get(args[0].get_element_type())->lowest();
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
pad_index = cuda_emitter->build_pad({{input_type, output_type}},
input_shape,
shape_to_pool,
padding_below,
padding_above,
Shape{},
pad_value);
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding during pooling
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
}
/// end asymmetric padding detection
size_t max_pool_index = build_pooling(CUDNN_POOLING_MAX,
output_type,
CUDNNEmitter::Prop::Forward,
shape_to_pool,
result_shape,
node->get_window_movement_strides(),
node->get_window_shape(),
padding_below,
padding_above);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
if (idx_workspace != std::numeric_limits<size_t>::max() &&
pad_index != std::numeric_limits<size_t>::max())
{
void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
pad_index,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{pad_buffer}.data());
gpu::invoke_primitive(
m_ctx, max_pool_index, std::vector<void*>{pad_buffer}.data(), outputs);
}
else
{
gpu::invoke_primitive(m_ctx, max_pool_index, inputs, outputs);
}
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Max* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto& input_shape = args[0].get_shape();
auto& output_shape = out[0].get_shape();
auto input_size = shape_size(input_shape);
auto output_size = shape_size(output_shape);
auto output_element_size = out[0].get_element_type().size();
auto output_type = out[0].get_element_type().c_type_string();
std::stringstream ss;
ss << "max_" << output_type << "_i" << join(input_shape, "_") << "_ra"
<< join(node->get_reduction_axes(), "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
std::unique_ptr<gpu::primitive> kernel_launch;
// one of the input axes has zero size; fill the output with -inf, the identity element of max
if (input_size == 0)
{
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
std::vector<float> negative_inf(output_size, -std::numeric_limits<float>::infinity());
size_t idx_float_inf =
allocator.reserve_argspace(negative_inf.data(), negative_inf.size() * sizeof(float));
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* temp_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_float_inf);
runtime::gpu::cuda_memcpyDtD(outputs[0], temp_d, output_size * output_element_size);
}});
}
else if (input_size == output_size)
{
// no reduction
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
runtime::gpu::cuda_memcpyDtD(outputs[0], inputs[0], output_size * output_element_size);
}});
}
else
{
auto& cudnn_emitter = m_primitive_emitter->get_cudnn_emitter();
auto max_index = cudnn_emitter->build_reduce_forward(
CUDNN_REDUCE_TENSOR_MAX, output_type, input_shape, node->get_reduction_axes());
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
gpu::invoke_primitive(m_ctx, max_index, inputs, outputs);
}});
}
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Min* node)
{
auto& args = node->get_inputs();
auto& out = node->get_outputs();
auto& input_shape = args[0].get_shape();
auto& output_shape = out[0].get_shape();
auto input_size = shape_size(input_shape);
auto output_size = shape_size(output_shape);
auto output_element_size = out[0].get_element_type().size();
auto output_type = out[0].get_element_type().c_type_string();
std::stringstream ss;
ss << "min_" << output_type << "_i" << join(input_shape, "_") << "_ra"
<< join(node->get_reduction_axes(), "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
......@@ -350,6 +868,52 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
return primitive_index;
}
std::unique_ptr<gpu::primitive> kernel_launch;
// one of the input axes has zero size; fill the output with +inf, the identity element of min
if (input_size == 0)
{
GPUAllocator allocator = m_primitive_emitter->get_memory_allocator();
std::vector<float> positive_inf(output_size, std::numeric_limits<float>::infinity());
size_t idx_float_inf =
allocator.reserve_argspace(positive_inf.data(), positive_inf.size() * sizeof(float));
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* temp_d = runtime::gpu::invoke_memory_primitive(m_ctx, idx_float_inf);
runtime::gpu::cuda_memcpyDtD(outputs[0], temp_d, output_size * output_element_size);
}});
}
else if (input_size == output_size)
{
// no reduction
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
runtime::gpu::cuda_memcpyDtD(outputs[0], inputs[0], output_size * output_element_size);
}});
}
else
{
auto& cudnn_emitter = m_primitive_emitter->get_cudnn_emitter();
auto min_index = cudnn_emitter->build_reduce_forward(
CUDNN_REDUCE_TENSOR_MIN, output_type, input_shape, node->get_reduction_axes());
kernel_launch.reset(new gpu::primitive{[=](void** inputs, void** outputs) mutable {
gpu::invoke_primitive(m_ctx, min_index, inputs, outputs);
}});
}
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
const Shape& input_tensor_shape,
const Shape& input_filter_shape,
const Shape& output_tensor_shape,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const Shape& padding_below)
{
cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
......@@ -399,9 +963,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution(const std::string& dtype,
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
return this->m_primitive_emitter->insert(std::move(conv));
}
size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
......@@ -413,20 +975,6 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
const Strides& window_dilation_strides,
const Shape& padding_below)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_data_op_" << dtype << "_i" << join(input_tensor_shape, "_") << "_w"
<< join(input_filter_shape, "_") << "_o" << join(output_tensor_shape, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
const cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
......@@ -476,9 +1024,7 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_data(
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
return this->m_primitive_emitter->insert(std::move(conv));
}
size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
......@@ -490,21 +1036,6 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
const Strides& window_dilation_strides,
const Shape& padding_below)
{
// construct hash to determine if kernel needs to be emitted
// or if it already exists in the primitive list
std::stringstream ss;
ss << "convolution_bp_filter_op_" << dtype << "_i" << join(input_tensor_shape_0, "_") << "_w"
<< join(output_filter_shape, "_") << "_o" << join(input_tensor_shape_1, "_") << "_ws"
<< join(window_movement_strides, "_") << "_wd" << join(window_dilation_strides, "_") << "_p"
<< join(padding_below, "_");
std::string hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
const cudnnDataType_t data_type = get_cudnn_datatype(dtype);
const cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
const cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
......@@ -554,9 +1085,8 @@ size_t runtime::gpu::CUDNNEmitter::build_convolution_backward_filter(
outputs[0]));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(conv));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
return this->m_primitive_emitter->insert(std::move(conv));
}
size_t runtime::gpu::CUDNNEmitter::build_pooling(const cudnnPoolingMode_t& pool_op,
......@@ -591,7 +1121,10 @@ size_t runtime::gpu::CUDNNEmitter::build_pooling(const cudnnPoolingMode_t& pool_
auto& input_desc = tensor_descriptor_from_shape(input_shape, data_type, tensor_format);
auto& output_desc = tensor_descriptor_from_shape(output_shape, data_type, tensor_format);
if (input_shape.size() == 3)
{
}
else if (input_shape.size() == 4)
{
CUDNN_SAFE_CALL(cudnnSetPooling2dDescriptor(desc,
pool_op,
......
......@@ -30,6 +30,11 @@
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/max.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/min.hpp"
namespace ngraph
{
namespace runtime
......@@ -48,6 +53,14 @@ namespace ngraph
{
friend class GPUPrimitiveEmitter;
public:
size_t build_primitive(const op::Convolution* node);
size_t build_primitive(const op::ConvolutionBackpropData* node);
size_t build_primitive(const op::ConvolutionBackpropFilters* node);
size_t build_primitive(const op::MaxPool* node);
size_t build_primitive(const op::Max* node);
size_t build_primitive(const op::Min* node);
public:
enum class Prop
{
......
......@@ -144,190 +144,26 @@ namespace ngraph
auto convolution = static_cast<const ngraph::op::Convolution*>(node);
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
auto rank = input_shape.size();
Strides window_dilation_strides = convolution->get_window_dilation_strides();
Strides window_movement_strides = convolution->get_window_movement_strides();
Strides data_dilation_strides = convolution->get_data_dilation_strides();
CoordinateDiff padding_below_diff = convolution->get_padding_below();
CoordinateDiff padding_above_diff = convolution->get_padding_above();
if (padding_below_diff.size() > 3)
size_t conv_index = 0;
if (convolution->get_padding_below().size() > 3)
{
// Reshape from NC{d1,..,dn} -> C{d1,...,dn}N
// and from KC{df1,...,dfn} -> C{df1,...,dfn}N.
// TODO: This should be done via a pass similar to
// what is done for convolution in the IA transformer
// c.f runtime/cpu/pass/cpu_layout.cpp
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t transposed_data_idx = allocator.reserve_workspace(
args[0].get_size() * args[0].get_element_type().size());
size_t transposed_filter_idx = allocator.reserve_workspace(
args[1].get_size() * args[1].get_element_type().size());
size_t transposed_output_idx = allocator.reserve_workspace(
out[0].get_size() * out[0].get_element_type().size());
GPUShape input_order;
for (int i = 1; i <= rank; i++)
{
input_order.push_back(i % rank);
}
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
size_t reshape_data_index = cuda_emitter->build_reshape(
{{args[0].get_type(), args[0].get_type()}}, input_shape, input_order);
writer << "void* data = gpu::invoke_memory_primitive(ctx, "
<< transposed_data_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_data_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{data}.data());\n";
size_t reshape_filter_index = cuda_emitter->build_reshape(
{{args[1].get_type(), args[1].get_type()}}, filter_shape, input_order);
writer << "void* filter = gpu::invoke_memory_primitive(ctx, "
<< transposed_filter_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_filter_index << ", ";
writer << "std::vector<void*>{" << args[1].get_name() << "}.data(), ";
writer << "std::vector<void*>{filter}.data());\n";
// local helper to reshape tensor shape objects
auto reshape = [](const Shape& shape, const GPUShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
output[i] = shape[order[i]];
}
return output;
};
// reorder axes of the input shape (NC{d_1,...,d_n} -> C{d_1,...,d_n}N)
input_shape = reshape(input_shape, input_order);
// reorder axes of the filter shape (KC{df_1,...,df_n} -> C{df_1,...,df_n}K)
filter_shape = reshape(filter_shape, input_order);
// reorder axes of the output shape (NK{do_1,...,do_n} -> K{do_1,...,do_n}N)
output_shape = reshape(output_shape, input_order);
size_t conv_index = cuda_emitter->build_convolution(
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
padding_below_diff,
data_dilation_strides,
filter_shape,
window_movement_strides,
window_dilation_strides,
output_shape);
writer << "void* output = gpu::invoke_memory_primitive(ctx, "
<< transposed_output_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{data, filter}.data(), ";
writer << "std::vector<void*>{output}.data());\n";
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
input_order.push_back(static_cast<int>(rank - 1));
for (int i = 0; i < rank - 1; i++)
{
input_order.push_back(i);
}
size_t reshape_output_index = cuda_emitter->build_reshape(
{{args[1].get_type(), args[1].get_type()}}, output_shape, input_order);
writer << "gpu::invoke_primitive(ctx, " << reshape_output_index << ", ";
writer << "std::vector<void*>{output}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
conv_index = cuda_emitter->build_primitive(convolution);
}
else
{
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin();
if (pad_required || is_deconvolution)
{
input_shape_padded = get_padded_shape(
input_shape, padding_below, padding_above, padding_interior);
Shape input_padded_strides = row_major_strides(input_shape_padded);
auto temp_size =
shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(input_shape_padded) << ", 0);\n";
writer
<< "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index = cuda_emitter->build_pad_dynamic(
{{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
writer << "gpu::invoke_primitive(ctx, " << pad_dynamic_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index = cudnn_emitter->build_convolution(out[0].get_type(),
input_shape_padded,
args[1].get_shape(),
out[0].get_shape(),
window_movement_strides,
window_dilation_strides,
padding_below);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
if (pad_required || is_deconvolution)
{
writer << "std::vector<void*>{pad_buffer, " << args[1].get_name()
<< "}.data(), ";
}
else
{
writer << "std::vector<void*>{" << args[0].get_name() << ","
<< args[1].get_name() << "}.data(), ";
}
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
writer.block_end();
conv_index = cudnn_emitter->build_primitive(convolution);
}
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{";
writer << args[0].get_name() << ", ";
writer << args[1].get_name() << ", ";
writer << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
template <>
......@@ -339,128 +175,23 @@ namespace ngraph
}
auto convolution = static_cast<const ngraph::op::ConvolutionBackpropData*>(node);
Strides window_dilation_strides =
convolution->get_window_dilation_strides_forward();
Strides window_movement_strides =
convolution->get_window_movement_strides_forward();
Strides data_dilation_strides = convolution->get_data_dilation_strides_forward();
CoordinateDiff padding_below_diff = convolution->get_padding_below_forward();
CoordinateDiff padding_above_diff = convolution->get_padding_above_forward();
if (padding_below_diff.size() > 3)
if (convolution->get_padding_below_forward().size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto output_shape = out[0].get_shape();
auto output_shape_padded = output_shape;
Shape padding_below_back(output_shape.size(), 0);
Shape padding_interior_back(output_shape.size(), 1);
size_t i = padding_below_back.size() - padding_below.size();
size_t j = 0;
for (; i < padding_below_back.size(); i++)
{
padding_below_back[i] = padding_below[j];
padding_interior_back[i] = data_dilation_strides[j];
j++;
}
Shape padding_interior(data_dilation_strides);
writer.block_begin();
if (pad_required || is_deconvolution)
{
output_shape_padded = get_padded_shape(
output_shape, padding_below, padding_above, padding_interior);
auto temp_size =
shape_size(output_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(output_shape_padded) << ", 0);\n";
writer << "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic({{args[0].get_type(), out[0].get_type()}},
output_shape,
output_shape_padded,
padding_below,
padding_interior);
writer << "gpu::invoke_primitive(ctx, " << pad_dynamic_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index =
cudnn_emitter->build_convolution_backward_data(out[0].get_type(),
args[0].get_shape(),
args[1].get_shape(),
output_shape_padded,
window_movement_strides,
window_dilation_strides,
padding_below);
size_t conv_index = cudnn_emitter->build_primitive(convolution);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "," << args[1].get_name()
<< "}.data(), ";
if (pad_required || is_deconvolution)
{
writer << "std::vector<void*>{pad_buffer}.data()";
}
else
{
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
}
writer << ");\n";
// since we padded output with temp buffer, we need to copy back to real output
if (pad_required || is_deconvolution)
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto slice_index =
cuda_emitter->build_slice({{args[0].get_type(), out[0].get_type()}},
output_shape_padded,
padding_below_back,
padding_interior_back,
output_shape);
writer << "gpu::invoke_primitive(ctx, " << slice_index << ", ";
writer << "std::vector<void*>{pad_buffer}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
writer.block_end();
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{";
writer << args[0].get_name() << ", ";
writer << args[1].get_name() << ", ";
writer << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
template <>
......@@ -473,102 +204,22 @@ namespace ngraph
auto convolution = static_cast<const ngraph::op::ConvolutionBackpropFilters*>(node);
Strides window_dilation_strides =
convolution->get_window_dilation_strides_forward();
Strides window_movement_strides =
convolution->get_window_movement_strides_forward();
Strides data_dilation_strides = convolution->get_data_dilation_strides_forward();
CoordinateDiff padding_below_diff = convolution->get_padding_below_forward();
CoordinateDiff padding_above_diff = convolution->get_padding_above_forward();
if (padding_below_diff.size() > 3)
if (convolution->get_padding_below_forward().size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
}
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
if (a != 1)
{
is_deconvolution = true;
break;
}
}
bool pad_required = (padding_below_diff != padding_above_diff);
Shape padding_below(padding_below_diff.size(), 0);
Shape padding_above(padding_above_diff.size(), 0);
for (int i = 0; i < padding_below.size(); i++)
{
padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto input_shape = args[0].get_shape();
auto input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin();
if (pad_required || is_deconvolution)
{
input_shape_padded = get_padded_shape(
input_shape, padding_below, padding_above, padding_interior);
auto temp_size =
shape_size(input_shape_padded) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(input_shape_padded) << ", 0);\n";
writer << "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic({{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
padding_below,
padding_interior);
writer << "gpu::invoke_primitive(ctx, " << pad_dynamic_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
size_t index =
cudnn_emitter->build_convolution_backward_filter(out[0].get_type(),
input_shape_padded,
args[1].get_shape(),
out[0].get_shape(),
window_movement_strides,
window_dilation_strides,
padding_below);
size_t conv_index = cudnn_emitter->build_primitive(convolution);
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
if (pad_required || is_deconvolution)
{
writer << "std::vector<void*>{pad_buffer, " << args[1].get_name()
<< "}.data(), ";
}
else
{
writer << "std::vector<void*>{" << args[0].get_name() << ","
<< args[1].get_name() << "}.data(), ";
}
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
writer.block_end();
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{";
writer << args[0].get_name() << ", ";
writer << args[1].get_name() << ", ";
writer << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
template <>
......@@ -1146,96 +797,40 @@ namespace ngraph
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Max)
{
const ngraph::op::Max* max_op = static_cast<const ngraph::op::Max*>(node);
writer.block_begin();
{
if (out[0].get_size() != 0)
{
// one of args[] axes has zero size, zero output
if (args[0].get_size() == 0)
{
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
std::vector<float> negative_inf(
out[0].get_size(), -std::numeric_limits<float>::infinity());
size_t idx_float_inf = allocator.reserve_argspace(
negative_inf.data(), negative_inf.size() * sizeof(float));
writer << "void* temp_d = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_float_inf << ");\n";
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name()
<< ", temp_d, " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
}
else if (args[0].get_size() == out[0].get_size())
if (out[0].get_size() == 0)
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
return;
}
else
{
const ngraph::op::Max* max = static_cast<const ngraph::op::Max*>(node);
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto max_index =
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MAX,
out[0].get_type(),
args[0].get_shape(),
max_op->get_reduction_axes());
auto index = cudnn_emitter->build_primitive(max);
writer << "gpu::invoke_primitive(ctx, " << max_index << ", ";
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
return;
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Min)
{
const ngraph::op::Min* min_op = static_cast<const ngraph::op::Min*>(node);
writer.block_begin();
{
if (out[0].get_size() != 0)
{
// a reduced input axis has zero size; fill the output with +inf (the identity for min)
if (args[0].get_size() == 0)
{
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
std::vector<float> positive_inf(out[0].get_size(),
std::numeric_limits<float>::infinity());
size_t idx_float_inf = allocator.reserve_argspace(
positive_inf.data(), positive_inf.size() * sizeof(float));
writer << "void* temp_d = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_float_inf << ");\n";
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name()
<< ", temp_d, " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
}
else if (args[0].get_size() == out[0].get_size())
if (out[0].get_size() == 0)
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
return;
}
else
{
const ngraph::op::Min* min = static_cast<const ngraph::op::Min*>(node);
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto min_index =
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MIN,
out[0].get_type(),
args[0].get_shape(),
min_op->get_reduction_axes());
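// cached cuDNN min-reduction primitive for the general case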
auto index = cudnn_emitter->build_primitive(min);
writer << "gpu::invoke_primitive(ctx, " << min_index << ", ";
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
return;
}
......@@ -1561,10 +1156,8 @@ namespace ngraph
{
// assumes NC{d1,d2,...} format
auto max_pool = static_cast<const ngraph::op::MaxPool*>(node);
writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& result_shape = out[0].get_shape();
auto padding_below = max_pool->get_padding_below();
auto padding_above = max_pool->get_padding_above();
if (input_shape.size() < 3)
......@@ -1574,78 +1167,20 @@ namespace ngraph
"Tensors should have at least one spatial dimension, dim(NC{d1...dN}) "
"<= 3");
}
bool pad_required = false;
auto shape_to_pool =
get_padded_shape(input_shape, padding_below, padding_above, {});
if (shape_to_pool != input_shape)
{
pad_required = true;
}
pad_required = pad_required && (padding_below != padding_above);
// asymmetric padding
if (pad_required)
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
// auto temp_buffer = create_gpu_buffer(shape_size(output_shape)*type_size);
auto temp_size =
shape_size(shape_to_pool) * args[0].get_element_type().size();
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(temp_size);
writer << "void* pad_buffer = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_workspace << ");\n";
std::stringstream ss;
ss << TypeInfo::Get(args[0].get_element_type())->lowest();
auto pad_index =
cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
input_shape,
shape_to_pool,
padding_below,
padding_above,
Shape{},
ss.str());
writer << "gpu::invoke_primitive(ctx, " << pad_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied; zero out the padding vectors to
// ensure cuDNN does not assume padding during pooling
std::fill(padding_below.begin(), padding_below.end(), 0);
std::fill(padding_above.begin(), padding_above.end(), 0);
}
int num_nontrivial_dims = 0;
for (int64_t i = shape_to_pool.size() - 1; i > 1; i--)
{
if (shape_to_pool[i] > 1)
else if (input_shape.size() > 5)
{
num_nontrivial_dims++;
}
throw std::runtime_error(
"Pooling currently only supports up to 3 spatial dimensions.");
}
if (input_shape.size() <= 5)
{
size_t max_pool_index = 0;
size_t max_pool_index;
// 1d max pool (NCW)
if ((input_shape.size() == 3 || num_nontrivial_dims == 1))
if (input_shape.size() == 3)
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
max_pool_index = cuda_emitter->build_1d_max_pool(
{{args[0].get_type(), out[0].get_type()}},
input_shape,
result_shape,
max_pool->get_window_shape().back(),
max_pool->get_window_movement_strides().back());
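// cached CUDA 1d max-pool primitive; the padding and workspace handling
// above is performed inside build_primitive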
max_pool_index = cuda_emitter->build_primitive(max_pool);
}
// 2d and 3d max pool (NCHW)
else if (input_shape.size() == 4 || input_shape.size() == 5)
......@@ -1653,38 +1188,13 @@ namespace ngraph
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
max_pool_index = cudnn_emitter->build_pooling(
CUDNN_POOLING_MAX,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
shape_to_pool,
result_shape,
max_pool->get_window_movement_strides(),
max_pool->get_window_shape(),
padding_below,
padding_above);
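// cached cuDNN pooling primitive for the 2d/3d (NCHW/NCDHW) case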
max_pool_index = cudnn_emitter->build_primitive(max_pool);
}
writer << "gpu::invoke_primitive(ctx, " << max_pool_index << ", ";
if (pad_required)
{
writer << "std::vector<void*>{pad_buffer}.data(), ";
}
else
{
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
}
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
else
{
throw std::runtime_error(
"Pooling currently only supports up to 3 spatial dimensions.");
}
}
writer.block_end();
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::MaxPoolBackprop)
......@@ -1866,8 +1376,7 @@ namespace ngraph
size_t avg_pool_index = 0;
// if 1d or has asymmetric padding, must handle pooling manually
if (input_shape.size() == 3 || num_nontrivial_dims == 1 ||
padding_below != padding_above)
if (input_shape.size() == 3 || padding_below != padding_above)
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
......@@ -2015,80 +1524,34 @@ namespace ngraph
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Softmax)
{
auto softmax = static_cast<const ngraph::op::Softmax*>(node);
writer.block_begin();
{
auto softmax = static_cast<const ngraph::op::Softmax*>(node);
auto tensor_shape = args[0].get_shape();
auto axes = softmax->get_axes();
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
if (axes.size() != tensor_shape.size())
size_t softmax_index;
if (softmax->get_axes().size() != args[0].get_shape().size())
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
// reserve a temporary buffer for the intermediate reduction
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
auto reduced_shape = tensor_shape;
for (auto const& axis : axes)
{
reduced_shape[axis] = 1;
}
size_t reduced_size = shape_size(reduced_shape);
size_t workspace_idx = allocator.reserve_workspace(
reduced_size * out[0].get_element_type().size());
// exponentiate with fused sum reduction to calculate softmax denominator
size_t exp_sum_reduce =
cuda_emitter
->build_elementwise_collective<ngraph::op::Exp, ngraph::op::Add>(
{{args[0].get_type(), out[0].get_type()}},
args[0].get_shape(),
{},
axes,
true /* multi-output */);
writer << "void* workspace = gpu::invoke_memory_primitive(ctx, "
<< workspace_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << exp_sum_reduce << ", ";
writer << "std::vector<void*>{" << args[0].get_name();
writer << "}.data(), ";
// cache the elementwise result and the fused result (multi-output)
writer << "std::vector<void*>{ workspace, ";
writer << out[0].get_name() << "}.data()";
writer << ");\n";
// in-place binary division with fused broadcast to calculate the softmax
size_t div_broadcast =
cuda_emitter->build_elementwise_collective<ngraph::op::Divide>(
{{out[0].get_type(), out[0].get_type(), out[0].get_type()}},
out[0].get_shape(),
{1},
axes);
writer << "gpu::invoke_primitive(ctx, " << div_broadcast << ", ";
writer << "std::vector<void*>{" << out[0].get_name();
writer << ", workspace}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
softmax_index = cuda_emitter->build_primitive(softmax);
}
else
{
size_t softmax_index =
cudnn_emitter->build_softmax(CUDNN_SOFTMAX_FAST,
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
softmax_index = cudnn_emitter->build_softmax(CUDNN_SOFTMAX_FAST,
CUDNN_SOFTMAX_MODE_INSTANCE,
out[0].get_type(),
CUDNNEmitter::Prop::Forward,
tensor_shape);
args[0].get_shape());
}
writer << "gpu::invoke_primitive(ctx, " << softmax_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
writer.block_end();
}
}
......
......@@ -232,3 +232,19 @@ void runtime::gpu::kernel::emit_cudnnReduceTensor(codegen::CodeWriter& writer,
writer << " " << out.get_name() << "));\n";
writer << "ngraph::runtime::gpu::free_gpu_buffer(workspace_ptr);\n";
}
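// emit_type_string concatenates the C type names of a node's inputs and
// outputs (e.g. "float_float_" for a single-input, single-output f32 op)
// for use as part of a primitive-cache hash key; any spaces in type names
// are replaced with underscores so the key stays whitespace-free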
std::string runtime::gpu::kernel::emit_type_string(const Node* node)
{
std::stringstream ss;
for (auto const& input : node->get_inputs())
{
ss << input.get_element_type().c_type_string() << "_";
}
for (auto const& output : node->get_outputs())
{
ss << output.get_element_type().c_type_string() << "_";
}
std::string types = ss.str();
std::replace(types.begin(), types.end(), ' ', '_');
return types;
}
......@@ -82,6 +82,8 @@ namespace ngraph
const std::string& output_desc,
const float& alpha,
const float& beta);
std::string emit_type_string(const Node* node);
}
}
}
......