Commit ae45c984 authored by Chris Sullivan, committed by Scott Cyphers

Nd convolution via blocked GEMM for C{d1,...,dn}N layout (#1131)

* Added blank convolution kernel and refactored coordinate transform kernel helper.

* Added op::Reshape to the CUDAEmitter.

* Added 2-Nd tiled convolution.

* Bug fixes for data_dilation and the filter loop. Still need to add a test for coverage of register tiling.

* Styling.

* Removed some comments and code added for testing.

* Some tests became enabled in the merge; removing them.
parent 3a43bdac
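For orientation, the layout change named in the commit title is a cyclic permutation of axes: the leading batch axis N (or K for the filter) is rotated to the back, so NC{d1,...,dn} becomes C{d1,...,dn}N and KC{df1,...,dfn} becomes C{df1,...,dfn}K. A minimal standalone sketch of that reordering, mirroring the input_order loops and the reshape lambda in the diff below (helper names are invented for illustration, not part of this commit):

#include <cstddef>
#include <vector>

// Cyclic order for NC{d1,...,dn} -> C{d1,...,dn}N: for rank r it is {1, 2, ..., r-1, 0}.
std::vector<int> cdn_order(size_t rank)
{
    std::vector<int> order;
    for (size_t i = 1; i <= rank; i++)
    {
        order.push_back(static_cast<int>(i % rank));
    }
    return order;
}

// Inverse order used to move N back to the front afterwards: {r-1, 0, 1, ..., r-2}.
std::vector<int> ncd_order(size_t rank)
{
    std::vector<int> order{static_cast<int>(rank - 1)};
    for (size_t i = 0; i + 1 < rank; i++)
    {
        order.push_back(static_cast<int>(i));
    }
    return order;
}

// Apply an axis order to a shape: {N, C, H, W} with order {1, 2, 3, 0} gives {C, H, W, N}.
std::vector<size_t> permute(const std::vector<size_t>& shape, const std::vector<int>& order)
{
    std::vector<size_t> result(shape.size());
    for (size_t i = 0; i < shape.size(); i++)
    {
        result[i] = shape[order[i]];
    }
    return result;
}

The same forward order is applied to both data and filter, since in both cases the leading axis (N or K) moves to the end.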
......@@ -127,6 +127,21 @@ namespace ngraph
GPUShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order);
size_t build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
GPUShape filter_shape,
GPUShape filter_stride,
GPUShape filter_dilation,
GPUShape output_shape);
private:
CUDAEmitter(GPUPrimitiveEmitter* emitter);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
......
......@@ -114,6 +114,16 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
bool include_pad);
static void get_convolution_forward(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
int N,
int K,
int filter_size,
int rank,
int sm_tile_size = 8,
int reg_tile_size = 1);
static void add_pod_typedefs(codegen::CodeWriter& writer);
/// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
......@@ -127,6 +137,13 @@ namespace ngraph
std::string i_reduced_strides,
std::string o_coordinates,
size_t rank);
static void coordinate_transform_to_multi_d(codegen::CodeWriter& writer,
std::string i_strides,
std::string i_stride_magic,
std::string i_stride_shift,
std::string i_coord_product,
std::string o_coordinates,
size_t rank);
};
}
}
......
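coordinate_transform_to_multi_d appears to emit device code that expands a flat index into per-axis coordinates; the i_stride_magic and i_stride_shift parameters suggest the per-axis divisions are done with precomputed magic multipliers (cf. idiv_magic_u32 in gpu_util) rather than hardware division. A host-side equivalent using plain division would look roughly like this (illustrative sketch only, not the emitted CUDA):

#include <cstdint>
#include <vector>

// Expand a flat row-major index into multi-dimensional coordinates.
// The generated kernel performs the same arithmetic, but replaces each
// division by a multiply-high-and-shift using the precomputed magic numbers.
std::vector<uint32_t> to_multi_d(uint32_t flat_index, const std::vector<uint32_t>& strides)
{
    std::vector<uint32_t> coordinates(strides.size());
    for (size_t i = 0; i < strides.size(); i++)
    {
        coordinates[i] = flat_index / strides[i];
        flat_index -= coordinates[i] * strides[i];
    }
    return coordinates;
}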
......@@ -149,6 +149,11 @@ namespace ngraph
}
auto convolution = static_cast<const ngraph::op::Convolution*>(node);
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
auto rank = input_shape.size();
Strides window_dilation_strides = convolution->get_window_dilation_strides();
Strides window_movement_strides = convolution->get_window_movement_strides();
Strides data_dilation_strides = convolution->get_data_dilation_strides();
......@@ -157,10 +162,105 @@ namespace ngraph
if (padding_below_diff.size() > 3)
{
throw std::runtime_error(node->get_name() +
"with more than 3D is not implemented.");
// Reshape from NC{d1,...,dn} -> C{d1,...,dn}N
// and from KC{df1,...,dfn} -> C{df1,...,dfn}K.
// TODO: This should be done via a pass similar to
// what is done for convolution in the IA transformer
// c.f runtime/cpu/pass/cpu_layout.cpp
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t transposed_data_idx = allocator.reserve_workspace(
args[0].get_size() * args[0].get_element_type().size());
size_t transposed_filter_idx = allocator.reserve_workspace(
args[1].get_size() * args[1].get_element_type().size());
size_t transposed_output_idx = allocator.reserve_workspace(
out[0].get_size() * out[0].get_element_type().size());
GPUShape input_order;
for (int i = 1; i <= rank; i++)
{
input_order.push_back(i % rank);
}
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
size_t reshape_data_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[0].get_type(), args[0].get_type()}},
input_shape,
input_order);
writer << "void* data = gpu::invoke_memory_primitive(ctx, "
<< transposed_data_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_data_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{data}.data());\n";
size_t reshape_filter_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[1].get_type(), args[1].get_type()}},
filter_shape,
input_order);
writer << "void* filter = gpu::invoke_memory_primitive(ctx, "
<< transposed_filter_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_filter_index << ", ";
writer << "std::vector<void*>{" << args[1].get_name() << "}.data(), ";
writer << "std::vector<void*>{filter}.data());\n";
// local helper to reshape tensor shape objects
auto reshape = [](const Shape& shape, const GPUShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
output[i] = shape[order[i]];
}
return output;
};
// reorder axes of the input shape (NC{d_1,...,d_n} -> C{d_1,...,d_n}N)
input_shape = reshape(input_shape, input_order);
// reorder axes of the filter shape (KC{df_1,...,df_n} -> C{df_1,...,df_n}K)
filter_shape = reshape(filter_shape, input_order);
// reorder axes of the output shape (NK{do_1,...,do_n} -> K{do_1,...,do_n}N)
output_shape = reshape(output_shape, input_order);
size_t conv_index = cuda_emitter->build_convolution(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
padding_below_diff,
data_dilation_strides,
filter_shape,
window_movement_strides,
window_dilation_strides,
output_shape);
writer << "void* output = gpu::invoke_memory_primitive(ctx, "
<< transposed_output_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{data, filter}.data(), ";
writer << "std::vector<void*>{output}.data());\n";
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
input_order.push_back(static_cast<int>(rank - 1));
for (int i = 0; i < rank - 1; i++)
{
input_order.push_back(i);
}
size_t reshape_output_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[1].get_type(), args[1].get_type()}},
output_shape,
input_order);
writer << "gpu::invoke_primitive(ctx, " << reshape_output_index << ", ";
writer << "std::vector<void*>{output}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
else
{
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
......@@ -181,7 +281,6 @@ namespace ngraph
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto input_shape = args[0].get_shape();
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin();
......@@ -199,12 +298,13 @@ namespace ngraph
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(input_shape_padded) << ", 0);\n";
writer << "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
writer
<< "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index =
cuda_emitter->build_pad_dynamic(external_function->ctx().get(),
auto pad_dynamic_index = cuda_emitter->build_pad_dynamic(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
......@@ -216,7 +316,7 @@ namespace ngraph
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding
// ensure cudnn does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
......@@ -246,6 +346,7 @@ namespace ngraph
writer << ");\n";
writer.block_end();
}
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBackpropData)
......
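As a concrete trace of the reorderings emitted above, consider a hypothetical rank-4 case (sizes invented; unit stride, no padding, no dilation assumed):

// input_order = {1, 2, 3, 0}
//   data    NCHW {32, 3, 224, 224}  ->  CHWN {3, 224, 224, 32}
//   filter  KCRS {64, 3, 7, 7}      ->  CRSK {3, 7, 7, 64}
// the convolution writes its result in K{do1,do2}N layout: {64, 218, 218, 32}
// restore order = {3, 0, 1, 2}
//   output  K{do1,do2}N {64, 218, 218, 32}  ->  NK{do1,do2} {32, 64, 218, 218}, the layout out[0] expects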
......@@ -42,6 +42,12 @@ namespace ngraph
GPUAllocator(const GPUAllocator& g);
~GPUAllocator();
template <typename T>
size_t reserve_argspace(const T& container)
{
return reserve_argspace(container.data(),
container.size() * sizeof(typename T::value_type));
}
size_t reserve_argspace(const void* data, size_t size);
size_t reserve_workspace(size_t size, bool zero_initialize = true);
......
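The new reserve_argspace overload is a convenience wrapper that derives the byte count from a container; a hypothetical call site (variable names invented for illustration):

GPUAllocator allocator =
    external_function->get_primitive_emitter()->get_memory_allocator();
std::vector<float> host_values{1.0f, 2.0f, 3.0f};
// forwards to reserve_argspace(host_values.data(), host_values.size() * sizeof(float))
size_t values_idx = allocator.reserve_argspace(host_values);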
......@@ -21,6 +21,7 @@
#include <vector>
#include "ngraph/axis_set.hpp"
#include "ngraph/axis_vector.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp"
......@@ -30,45 +31,45 @@ namespace ngraph
{
class Shape;
/// \brief Shape for a tensor resident on GPU.
class GPUShape : public std::vector<uint32_t>
class GPUShape : public std::vector<int32_t>
{
public:
GPUShape(const std::initializer_list<uint32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
GPUShape(const std::initializer_list<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const std::vector<uint32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
GPUShape(const std::vector<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const GPUShape& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
explicit GPUShape(size_t n, uint32_t initial_value = 0)
: std::vector<uint32_t>(n, initial_value)
explicit GPUShape(size_t n, int32_t initial_value = 0)
: std::vector<int32_t>(n, initial_value)
{
}
template <class InputIterator>
GPUShape(InputIterator first, InputIterator last)
: std::vector<uint32_t>(first, last)
: std::vector<int32_t>(first, last)
{
}
GPUShape() {}
GPUShape& operator=(const GPUShape& v)
{
static_cast<std::vector<uint32_t>*>(this)->operator=(v);
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
GPUShape& operator=(GPUShape&& v)
{
static_cast<std::vector<uint32_t>*>(this)->operator=(v);
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
......@@ -81,7 +82,7 @@ namespace ngraph
throw std::runtime_error(
"Request exceeds the bitwidth available for GPUShapes (32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
......@@ -95,7 +96,7 @@ namespace ngraph
"Request for Shape which exceeds the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
......@@ -109,7 +110,7 @@ namespace ngraph
"Request for Strides which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
......@@ -123,21 +124,36 @@ namespace ngraph
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
GPUShape(const CoordinateDiff& coord)
{
for (auto const& size : coord)
for (auto const& dim : coord)
{
if (dim > 0 && dim >> 32 != 0)
{
throw std::runtime_error(
"Request for CoordinateDiff which exceed the bitwidth available for "
"GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(dim));
}
}
GPUShape(const AxisVector& vec)
{
for (auto const& size : vec)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"Request for axis vector which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
};
......
......@@ -186,3 +186,9 @@ std::pair<uint64_t, uint64_t> runtime::gpu::idiv_magic_u64(uint64_t divisor)
{
return magicU64(divisor);
}
uint32_t runtime::gpu::idiv_ceil(int n, int d)
{
// the compiler fuses the division and modulo into a single divide
return n / d + (n % d > 0);
}
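idiv_ceil rounds the integer quotient up. Its behavior on a few sample inputs, plus a typical (assumed) use for sizing a launch grid:

#include <cassert>
// idiv_ceil(n, d) == ceil(n / d) for positive n and d
assert(runtime::gpu::idiv_ceil(16, 8) == 2);
assert(runtime::gpu::idiv_ceil(17, 8) == 3);
assert(runtime::gpu::idiv_ceil(7, 8) == 1);
// e.g. (hypothetical names) uint32_t blocks = idiv_ceil(element_count, threads_per_block);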
......@@ -103,6 +103,7 @@ namespace ngraph
void cuda_memset(void* dst, int value, size_t buffer_size);
std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor);
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
uint32_t idiv_ceil(int n, int d);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
......
......@@ -6,17 +6,6 @@ batch_norm_three_outputs
computation_reuse
#int64 is not supported
concat_matrix_int64
#convolution 4d is work in progress
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
convolution_4d_4items_padded_neg
convolution_4d_4items_strided
convolution_4d_4items_strided_dilated
convolution_4d_4items_strided_dilated_padded
convolution_4d_4items_strided_dilated_padded_neg
convolution_4d_4items_strided_dilated_padded_same
#cuDNN does not have arithmetic exceptions
divide_by_zero_int32
#int64 is not supported by cuDNN
dot_matrix_vector_int64
......