Commit ae45c984 authored by Chris Sullivan's avatar Chris Sullivan Committed by Scott Cyphers

Nd convolution via blocked GEMM for C{d1,...,dn}N layout (#1131)

* Added blank convolution kernel and refactored coordinate transform kernel helper.

* Added op::Reshape to the CUDAEmitter.

* Added 2-Nd tiled convolution.

* Bug fixes with data_dilation and filter loop. Still need to add test for coverage of register tiling.

* Styling.

* Removed some comments and code added for testing.

* Some tests became enabled in merge, removing them.
parent 3a43bdac
This diff is collapsed.
...@@ -127,6 +127,21 @@ namespace ngraph ...@@ -127,6 +127,21 @@ namespace ngraph
GPUShape result_shape, GPUShape result_shape,
const std::set<size_t>& bcast_axes); const std::set<size_t>& bcast_axes);
size_t build_reshape(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order);
size_t build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
GPUShape filter_shape,
GPUShape filter_stride,
GPUShape filter_dilation,
GPUShape output_shape);
private: private:
CUDAEmitter(GPUPrimitiveEmitter* emitter); CUDAEmitter(GPUPrimitiveEmitter* emitter);
void print_tensor_from_gpu(codegen::CodeWriter& writer, void print_tensor_from_gpu(codegen::CodeWriter& writer,
......
...@@ -114,6 +114,16 @@ namespace ngraph ...@@ -114,6 +114,16 @@ namespace ngraph
const std::array<std::string, 2>& data_types, const std::array<std::string, 2>& data_types,
bool include_pad); bool include_pad);
static void get_convolution_forward(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
int N,
int K,
int filter_size,
int rank,
int sm_tile_size = 8,
int reg_tile_size = 1);
static void add_pod_typedefs(codegen::CodeWriter& writer); static void add_pod_typedefs(codegen::CodeWriter& writer);
/// \brief Given kernel input variables i_* produce register variables o_coordinates{i} /// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
...@@ -127,6 +137,13 @@ namespace ngraph ...@@ -127,6 +137,13 @@ namespace ngraph
std::string i_reduced_strides, std::string i_reduced_strides,
std::string o_coordinates, std::string o_coordinates,
size_t rank); size_t rank);
static void coordinate_transform_to_multi_d(codegen::CodeWriter& writer,
std::string i_strides,
std::string i_stride_magic,
std::string i_stride_shift,
std::string i_coord_product,
std::string o_coordinates,
size_t rank);
}; };
} }
} }
......
This diff is collapsed.
...@@ -42,6 +42,12 @@ namespace ngraph ...@@ -42,6 +42,12 @@ namespace ngraph
GPUAllocator(const GPUAllocator& g); GPUAllocator(const GPUAllocator& g);
~GPUAllocator(); ~GPUAllocator();
/// \brief Reserve device argument space for the contents of a contiguous
///        container. Accepts any type exposing data() and size() with a
///        nested value_type (e.g. std::vector, std::array, GPUShape).
/// \param container Host-side container whose bytes will be staged.
/// \return Index/handle of the reserved argument space, as produced by the
///         non-template reserve_argspace(const void*, size_t) overload.
template <typename T>
size_t reserve_argspace(const T& container)
{
// Total byte count = element count * element size; forwards to the
// untyped overload declared below.
return reserve_argspace(container.data(),
container.size() * sizeof(typename T::value_type));
}
size_t reserve_argspace(const void* data, size_t size); size_t reserve_argspace(const void* data, size_t size);
size_t reserve_workspace(size_t size, bool zero_initialize = true); size_t reserve_workspace(size_t size, bool zero_initialize = true);
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <vector> #include <vector>
#include "ngraph/axis_set.hpp" #include "ngraph/axis_set.hpp"
#include "ngraph/axis_vector.hpp"
#include "ngraph/coordinate.hpp" #include "ngraph/coordinate.hpp"
#include "ngraph/coordinate_diff.hpp" #include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp" #include "ngraph/shape.hpp"
...@@ -30,45 +31,45 @@ namespace ngraph ...@@ -30,45 +31,45 @@ namespace ngraph
{ {
class Shape; class Shape;
/// \brief Shape for a tensor resident on GPU. /// \brief Shape for a tensor resident on GPU.
class GPUShape : public std::vector<uint32_t> class GPUShape : public std::vector<int32_t>
{ {
public: public:
GPUShape(const std::initializer_list<uint32_t>& axis_lengths) GPUShape(const std::initializer_list<int32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths) : std::vector<int32_t>(axis_lengths)
{ {
} }
GPUShape(const std::vector<uint32_t>& axis_lengths) GPUShape(const std::vector<int32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths) : std::vector<int32_t>(axis_lengths)
{ {
} }
GPUShape(const GPUShape& axis_lengths) GPUShape(const GPUShape& axis_lengths)
: std::vector<uint32_t>(axis_lengths) : std::vector<int32_t>(axis_lengths)
{ {
} }
explicit GPUShape(size_t n, uint32_t initial_value = 0) explicit GPUShape(size_t n, int32_t initial_value = 0)
: std::vector<uint32_t>(n, initial_value) : std::vector<int32_t>(n, initial_value)
{ {
} }
template <class InputIterator> template <class InputIterator>
GPUShape(InputIterator first, InputIterator last) GPUShape(InputIterator first, InputIterator last)
: std::vector<uint32_t>(first, last) : std::vector<int32_t>(first, last)
{ {
} }
GPUShape() {} GPUShape() {}
GPUShape& operator=(const GPUShape& v) GPUShape& operator=(const GPUShape& v)
{ {
static_cast<std::vector<uint32_t>*>(this)->operator=(v); static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this; return *this;
} }
GPUShape& operator=(GPUShape&& v) GPUShape& operator=(GPUShape&& v)
{ {
static_cast<std::vector<uint32_t>*>(this)->operator=(v); static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this; return *this;
} }
...@@ -81,7 +82,7 @@ namespace ngraph ...@@ -81,7 +82,7 @@ namespace ngraph
throw std::runtime_error( throw std::runtime_error(
"Request exceeds the bitwidth available for GPUShapes (32)"); "Request exceeds the bitwidth available for GPUShapes (32)");
} }
this->push_back(static_cast<uint32_t>(size)); this->push_back(static_cast<int32_t>(size));
} }
} }
...@@ -95,7 +96,7 @@ namespace ngraph ...@@ -95,7 +96,7 @@ namespace ngraph
"Request for Shape which exceeds the bitwidth available for GPUShapes " "Request for Shape which exceeds the bitwidth available for GPUShapes "
"(32)"); "(32)");
} }
this->push_back(static_cast<uint32_t>(size)); this->push_back(static_cast<int32_t>(size));
} }
} }
...@@ -109,7 +110,7 @@ namespace ngraph ...@@ -109,7 +110,7 @@ namespace ngraph
"Request for Strides which exceed the bitwidth available for GPUShapes " "Request for Strides which exceed the bitwidth available for GPUShapes "
"(32)"); "(32)");
} }
this->push_back(static_cast<uint32_t>(size)); this->push_back(static_cast<int32_t>(size));
} }
} }
...@@ -123,21 +124,36 @@ namespace ngraph ...@@ -123,21 +124,36 @@ namespace ngraph
"Request for Coordinate which exceed the bitwidth available for GPUShapes " "Request for Coordinate which exceed the bitwidth available for GPUShapes "
"(32)"); "(32)");
} }
this->push_back(static_cast<uint32_t>(size)); this->push_back(static_cast<int32_t>(size));
} }
} }
GPUShape(const CoordinateDiff& coord) GPUShape(const CoordinateDiff& coord)
{ {
for (auto const& size : coord) for (auto const& dim : coord)
{
if (dim > 0 && dim >> 32 != 0)
{
throw std::runtime_error(
"Request for CoordinateDiff which exceed the bitwidth available for "
"GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(dim));
}
}
/// \brief Convert an AxisVector (unsigned axis indices) to a GPUShape of
///        int32_t lanes.
/// \throws std::runtime_error if any element cannot be represented in 32
///         signed bits.
GPUShape(const AxisVector& vec)
{
    for (auto const& size : vec)
    {
        // Elements are stored as int32_t, so bit 31 (and above) must be
        // clear. The previous check `size >> 32 != 0` let values in
        // [2^31, 2^32) slip through and wrap negative in the cast below.
        if (size >> 31 != 0)
        {
            throw std::runtime_error(
                "Request for axis vector which exceed the bitwidth available for GPUShapes "
                "(32)");
        }
        this->push_back(static_cast<int32_t>(size));
    }
}
}; };
......
...@@ -186,3 +186,9 @@ std::pair<uint64_t, uint64_t> runtime::gpu::idiv_magic_u64(uint64_t divisor) ...@@ -186,3 +186,9 @@ std::pair<uint64_t, uint64_t> runtime::gpu::idiv_magic_u64(uint64_t divisor)
{ {
return magicU64(divisor); return magicU64(divisor);
} }
/// \brief Integer ceiling division: smallest integer >= n/d for n >= 0.
/// \param n numerator (expected non-negative)
/// \param d divisor (expected positive; no zero-divisor guard)
/// \return ceil(n / d) as an unsigned 32-bit value
uint32_t runtime::gpu::idiv_ceil(int n, int d)
{
    // Compute quotient and remainder separately; the compiler fuses the
    // two into a single division instruction, and a positive remainder
    // bumps the quotient up by one.
    const int quotient = n / d;
    const int remainder = n % d;
    return quotient + (remainder > 0 ? 1 : 0);
}
...@@ -103,6 +103,7 @@ namespace ngraph ...@@ -103,6 +103,7 @@ namespace ngraph
void cuda_memset(void* dst, int value, size_t buffer_size); void cuda_memset(void* dst, int value, size_t buffer_size);
std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor); std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor);
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor); std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
uint32_t idiv_ceil(int n, int d);
template <typename T> template <typename T>
void print_gpu_tensor(const void* p, size_t element_count) void print_gpu_tensor(const void* p, size_t element_count)
......
...@@ -6,17 +6,6 @@ batch_norm_three_outputs ...@@ -6,17 +6,6 @@ batch_norm_three_outputs
computation_reuse computation_reuse
#int64 is not supported #int64 is not supported
concat_matrix_int64 concat_matrix_int64
#convolution 4d is work in progress
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
convolution_4d_4items_padded_neg
convolution_4d_4items_strided
convolution_4d_4items_strided_dilated
convolution_4d_4items_strided_dilated_padded
convolution_4d_4items_strided_dilated_padded_neg
convolution_4d_4items_strided_dilated_padded_same
#cuDNN does not have arithmetic exceptions
divide_by_zero_int32 divide_by_zero_int32
#int64 is not supported by cuDNN #int64 is not supported by cuDNN
dot_matrix_vector_int64 dot_matrix_vector_int64
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment