Commit ae45c984 authored by Chris Sullivan, committed by Scott Cyphers

Nd convolution via blocked GEMM for C{d1,...,dn}N layout (#1131)

* Added blank convolution kernel and refactored coordinate transform kernel helper.

* Added op::Reshape to the CUDAEmitter.

* Added 2-Nd tiled convolution.

* Bug fixes for data_dilation and the filter loop. Still need to add a test covering register tiling.

* Styling.

* Removed some comments and code added for testing.

* Some tests became enabled in the merge; removing them.
parent 3a43bdac
@@ -127,6 +127,21 @@ namespace ngraph
GPUShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order);
size_t build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
GPUShape filter_shape,
GPUShape filter_stride,
GPUShape filter_dilation,
GPUShape output_shape);
private:
CUDAEmitter(GPUPrimitiveEmitter* emitter);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
......
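Both new builders return a size_t primitive index; the emitted code later hands that index to gpu::invoke_primitive together with pointer arrays for the inputs and outputs (the same pattern used in the gpu_emitter.cpp hunk further below). A minimal sketch of the calling convention, with shapes and buffer names purely illustrative:

// Hypothetical illustration of the primitive-index pattern; only the calling
// convention is taken from this commit, the values are made up.
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
size_t reshape_idx =
    cuda_emitter->build_reshape(external_function->ctx().get(),
                                {{"float", "float"}},
                                GPUShape{2, 3, 4, 5},  // NCHW input shape
                                GPUShape{1, 2, 3, 0}); // permutation to CHWN
// at runtime (emitted into the generated function):
gpu::invoke_primitive(ctx,
                      reshape_idx,
                      std::vector<void*>{src_buffer}.data(),
                      std::vector<void*>{dst_buffer}.data());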
@@ -114,6 +114,16 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
bool include_pad);
static void get_convolution_forward(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
int N,
int K,
int filter_size,
int rank,
int sm_tile_size = 8,
int reg_tile_size = 1);
static void add_pod_typedefs(codegen::CodeWriter& writer);
/// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
@@ -127,6 +137,13 @@ namespace ngraph
std::string i_reduced_strides,
std::string o_coordinates,
size_t rank);
static void coordinate_transform_to_multi_d(codegen::CodeWriter& writer,
std::string i_strides,
std::string i_stride_magic,
std::string i_stride_shift,
std::string i_coord_product,
std::string o_coordinates,
size_t rank);
};
}
}
......
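coordinate_transform_to_multi_d emits the per-thread index math that recovers multi-dimensional coordinates from a flat index using precomputed strides plus host-computed magic/shift pairs (idiv_magic_u32/u64), so the generated convolution kernel avoids integer division. A rough host-side sketch of that arithmetic (names hypothetical; the real helper writes CUDA source through the CodeWriter):

// Host-side sketch of the coordinate transform. In the emitted CUDA kernel the
// division below becomes a multiply-and-shift with the precomputed magic numbers.
#include <cstdint>
#include <vector>

std::vector<int32_t> to_multi_d(int32_t flat_index, const std::vector<int32_t>& strides)
{
    std::vector<int32_t> coords(strides.size());
    for (size_t d = 0; d < strides.size(); d++)
    {
        coords[d] = flat_index / strides[d]; // kernel: (flat_index * magic[d]) >> shift[d]
        flat_index -= coords[d] * strides[d];
    }
    return coords;
}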
@@ -149,6 +149,11 @@ namespace ngraph
}
auto convolution = static_cast<const ngraph::op::Convolution*>(node);
auto input_shape = args[0].get_shape();
auto filter_shape = args[1].get_shape();
auto output_shape = out[0].get_shape();
auto rank = input_shape.size();
Strides window_dilation_strides = convolution->get_window_dilation_strides();
Strides window_movement_strides = convolution->get_window_movement_strides();
Strides data_dilation_strides = convolution->get_data_dilation_strides();
@@ -157,10 +162,105 @@ namespace ngraph
if (padding_below_diff.size() > 3)
{
// Reshape from NC{d1,..,dn} -> C{d1,...,dn}N
// and from KC{df1,...,dfn} -> C{df1,...,dfn}K.
// TODO: This should be done via a pass similar to
// what is done for convolution in the IA transformer
// c.f runtime/cpu/pass/cpu_layout.cpp
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t transposed_data_idx = allocator.reserve_workspace(
args[0].get_size() * args[0].get_element_type().size());
size_t transposed_filter_idx = allocator.reserve_workspace(
args[1].get_size() * args[1].get_element_type().size());
size_t transposed_output_idx = allocator.reserve_workspace(
out[0].get_size() * out[0].get_element_type().size());
GPUShape input_order;
for (int i = 1; i <= rank; i++)
{
input_order.push_back(i % rank);
}
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
size_t reshape_data_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[0].get_type(), args[0].get_type()}},
input_shape,
input_order);
writer << "void* data = gpu::invoke_memory_primitive(ctx, "
<< transposed_data_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_data_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{data}.data());\n";
size_t reshape_filter_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[1].get_type(), args[1].get_type()}},
filter_shape,
input_order);
writer << "void* filter = gpu::invoke_memory_primitive(ctx, "
<< transposed_filter_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << reshape_filter_index << ", ";
writer << "std::vector<void*>{" << args[1].get_name() << "}.data(), ";
writer << "std::vector<void*>{filter}.data());\n";
// local helper to reshape tensor shape objects
auto reshape = [](const Shape& shape, const GPUShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
output[i] = shape[order[i]];
}
return output;
};
// reorder axes of the input shape (NC{d_1,...,d_n} -> C{d_1,...,d_n}N)
input_shape = reshape(input_shape, input_order);
// reorder axes of the filter shape (KC{df_1,...,df_n} -> C{df_1,...,df_n}K)
filter_shape = reshape(filter_shape, input_order);
// reorder axes of the output shape (NK{do_1,...,do_n} -> K{do_1,...,do_n}N)
output_shape = reshape(output_shape, input_order);
size_t conv_index = cuda_emitter->build_convolution(
external_function->ctx().get(),
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
padding_below_diff,
data_dilation_strides,
filter_shape,
window_movement_strides,
window_dilation_strides,
output_shape);
writer << "void* output = gpu::invoke_memory_primitive(ctx, "
<< transposed_output_idx << ");\n";
writer << "gpu::invoke_primitive(ctx, " << conv_index << ", ";
writer << "std::vector<void*>{data, filter}.data(), ";
writer << "std::vector<void*>{output}.data());\n";
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
input_order.push_back(static_cast<int>(rank - 1));
for (int i = 0; i < rank - 1; i++)
{
input_order.push_back(i);
}
size_t reshape_output_index =
cuda_emitter->build_reshape(external_function->ctx().get(),
{{args[1].get_type(), args[1].get_type()}},
output_shape,
input_order);
writer << "gpu::invoke_primitive(ctx, " << reshape_output_index << ", ";
writer << "std::vector<void*>{output}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data());\n";
}
else
{
bool is_deconvolution = false;
for (auto a : data_dilation_strides)
{
@@ -181,7 +281,6 @@ namespace ngraph
padding_above[i] = static_cast<size_t>(padding_above_diff[i]);
}
auto input_shape = args[0].get_shape();
Shape input_shape_padded = input_shape;
Shape padding_interior(data_dilation_strides);
writer.block_begin();
@@ -199,12 +298,13 @@ namespace ngraph
<< idx_workspace << ");\n";
writer << "std::vector<" << args[0].get_type() << "> pad_buffer_host("
<< shape_size(input_shape_padded) << ", 0);\n";
writer
<< "runtime::gpu::cuda_memcpyHtD(pad_buffer, pad_buffer_host.data(), "
<< temp_size << ");\n";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto pad_dynamic_index = cuda_emitter->build_pad_dynamic(
external_function->ctx().get(),
{{args[0].get_type(), out[0].get_type()}},
input_shape,
input_shape_padded,
@@ -216,7 +316,7 @@ namespace ngraph
writer << "std::vector<void*>{pad_buffer}.data()";
writer << ");\n";
// asymmetric padding has been applied, zero out padding vectors to
// ensure cuDNN does not assume padding
std::fill(padding_below.begin(), padding_below.end(), 0);
}
auto& cudnn_emitter =
@@ -246,6 +346,7 @@ namespace ngraph
writer << ");\n";
writer.block_end();
}
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBackpropData)
......
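The input_order built in the Nd branch above is the cyclic permutation {1, 2, ..., rank-1, 0}: for a rank-4 tensor it maps NCHW to CHWN (and KCHW to CHWK for the filter), while the inverse permutation {rank-1, 0, ..., rank-2} restores the NK{do_1,...,do_n} layout for the output. A small worked example of the local reshape helper under that assumption:

// Worked example of the shape permutation for rank = 4 (values hypothetical).
#include <cstddef>
#include <vector>
using Shape = std::vector<size_t>;

Shape reshape(const Shape& shape, const std::vector<int>& order)
{
    Shape output(shape.size(), 0);
    for (size_t i = 0; i < shape.size(); i++)
    {
        output[i] = shape[order[i]];
    }
    return output;
}

int main()
{
    Shape nchw{32, 16, 28, 28};               // N, C, H, W
    Shape chwn = reshape(nchw, {1, 2, 3, 0}); // -> {16, 28, 28, 32}
    Shape back = reshape(chwn, {3, 0, 1, 2}); // inverse -> {32, 16, 28, 28}
    return (chwn[3] == 32 && back == nchw) ? 0 : 1;
}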
@@ -42,6 +42,12 @@ namespace ngraph
GPUAllocator(const GPUAllocator& g);
~GPUAllocator();
template <typename T>
size_t reserve_argspace(const T& container)
{
return reserve_argspace(container.data(),
container.size() * sizeof(typename T::value_type));
}
size_t reserve_argspace(const void* data, size_t size);
size_t reserve_workspace(size_t size, bool zero_initialize = true);
......
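The new reserve_argspace template simply forwards a contiguous container's data pointer and byte size to the existing pointer overload, so constant argument buffers can be reserved straight from, e.g., a std::vector. A hypothetical sketch, assuming an allocator obtained as elsewhere in this commit:

// Hypothetical usage of the container overload of reserve_argspace.
std::vector<int32_t> strides{784, 28, 1};
GPUAllocator allocator = external_function->get_primitive_emitter()->get_memory_allocator();
size_t strides_idx = allocator.reserve_argspace(strides); // reserves 3 * sizeof(int32_t) bytes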
@@ -21,6 +21,7 @@
#include <vector>
#include "ngraph/axis_set.hpp"
#include "ngraph/axis_vector.hpp"
#include "ngraph/coordinate.hpp" #include "ngraph/coordinate.hpp"
#include "ngraph/coordinate_diff.hpp" #include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp" #include "ngraph/shape.hpp"
...@@ -30,45 +31,45 @@ namespace ngraph ...@@ -30,45 +31,45 @@ namespace ngraph
{ {
class Shape; class Shape;
/// \brief Shape for a tensor resident on GPU. /// \brief Shape for a tensor resident on GPU.
class GPUShape : public std::vector<int32_t>
{
public:
GPUShape(const std::initializer_list<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const std::vector<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const GPUShape& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
explicit GPUShape(size_t n, int32_t initial_value = 0)
: std::vector<int32_t>(n, initial_value)
{
}
template <class InputIterator>
GPUShape(InputIterator first, InputIterator last)
: std::vector<int32_t>(first, last)
{
}
GPUShape() {}
GPUShape& operator=(const GPUShape& v)
{
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
GPUShape& operator=(GPUShape&& v)
{
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
@@ -81,7 +82,7 @@ namespace ngraph
throw std::runtime_error(
"Request exceeds the bitwidth available for GPUShapes (32)");
}
this->push_back(static_cast<int32_t>(size));
}
}
@@ -95,7 +96,7 @@ namespace ngraph
"Request for Shape which exceeds the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(size));
}
}
@@ -109,7 +110,7 @@ namespace ngraph
"Request for Strides which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(size));
}
}
@@ -123,21 +124,36 @@ namespace ngraph
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(size));
}
}
GPUShape(const CoordinateDiff& coord)
{
for (auto const& dim : coord)
{
if (dim > 0 && dim >> 32 != 0)
{
throw std::runtime_error(
"Request for CoordinateDiff which exceed the bitwidth available for "
"GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(dim));
}
}
GPUShape(const AxisVector& vec)
{
for (auto const& size : vec)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
"Request for axis vector which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(size));
}
}
};
......
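Changing the element type from uint32_t to int32_t lets a GPUShape hold signed values, which the new CoordinateDiff constructor needs since padding_below_diff entries can be negative (only positive entries are range-checked against 32 bits). A minimal sketch, assuming illustrative values:

// Hypothetical: a negative below-padding now round-trips into a GPUShape
// because its elements are signed 32-bit integers.
CoordinateDiff padding_below_diff{-1, 0, 2};
GPUShape pad_below(padding_below_diff); // contains {-1, 0, 2}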
@@ -186,3 +186,9 @@ std::pair<uint64_t, uint64_t> runtime::gpu::idiv_magic_u64(uint64_t divisor)
{
return magicU64(divisor);
}
uint32_t runtime::gpu::idiv_ceil(int n, int d)
{
// the compiler fuses the division and the modulo into a single operation
return n / d + (n % d > 0);
}
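idiv_ceil is plain ceiling division; a typical (hypothetical) use is rounding a problem size up to a whole number of CUDA thread blocks when sizing a launch grid:

// Hypothetical grid-sizing example: idiv_ceil(n, d) == ceil(n / d) for positive n and d.
uint32_t element_count = 1000;
uint32_t block_size = 64;
uint32_t num_blocks = runtime::gpu::idiv_ceil(element_count, block_size); // 16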
@@ -103,6 +103,7 @@ namespace ngraph
void cuda_memset(void* dst, int value, size_t buffer_size);
std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor);
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
uint32_t idiv_ceil(int n, int d);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
......
@@ -6,17 +6,6 @@ batch_norm_three_outputs
computation_reuse
#int64 is not supported
concat_matrix_int64
#convolution 4d is work in progress
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
convolution_4d_4items_padded_neg
convolution_4d_4items_strided
convolution_4d_4items_strided_dilated
convolution_4d_4items_strided_dilated_padded
convolution_4d_4items_strided_dilated_padded_neg
convolution_4d_4items_strided_dilated_padded_same
#cuDNN does not have arithmetic exceptions
divide_by_zero_int32
#int64 is not supported by cuDNN
dot_matrix_vector_int64
......