Commit ae45c984 authored by Chris Sullivan, committed by Scott Cyphers

Nd convolution via blocked GEMM for C{d1,...,dn}N layout (#1131)

* Added blank convolution kernel and refactored coordinate transform kernel helper.

* Added op::Reshape to the CUDAEmitter.

* Added tiled convolution for 2-d through N-d tensors.

* Fixed bugs in data_dilation and the filter loop. A test covering register tiling still needs to be added.

* Styling.

* Removed some comments and code added for testing.

* Some tests became enabled in the merge; removed their entries from the test manifest.
parent 3a43bdac
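The core idea of this change, sketched below for the 1-d case as plain host-side C++ (an illustrative reference, not the emitted CUDA kernel): with the C{d1,...,dn}N layout the convolution becomes a blocked GEMM in which the filter acts as a [K x C*R] matrix and an im2col-style view of the input supplies the [C*R x P*N] operand, with the batch dimension N innermost so loads along it coalesce on the device.

// Reference 1-d convolution over C{d}N-ordered tensors, written as the
// GEMM it lowers to. All names and shapes here are illustrative.
#include <cstddef>
#include <vector>

void conv1d_as_gemm(const std::vector<float>& input,  // [C, D, N]
                    const std::vector<float>& filter, // [K, C, R]
                    std::vector<float>& output,       // [K, P, N], P = D - R + 1
                    size_t C, size_t D, size_t N, size_t K, size_t R)
{
    const size_t P = D - R + 1; // no padding/stride/dilation in this sketch
    for (size_t k = 0; k < K; k++)         // GEMM row (one filter)
        for (size_t p = 0; p < P; p++)     // GEMM column block (output position)
            for (size_t n = 0; n < N; n++) // innermost batch: contiguous in C{d}N
            {
                float acc = 0.f;
                for (size_t c = 0; c < C; c++)
                    for (size_t r = 0; r < R; r++)
                        acc += filter[(k * C + c) * R + r] * input[(c * D + p + r) * N + n];
                output[(k * P + p) * N + n] = acc;
            }
}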
@@ -127,6 +127,21 @@ namespace ngraph
GPUShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const GPURuntimeContext* ctx,
const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape input_order);
size_t build_convolution(const GPURuntimeContext* ctx,
const std::array<std::string, 3>& dtypes,
GPUShape input_shape,
GPUShape input_pad_below,
GPUShape input_dilation,
GPUShape filter_shape,
GPUShape filter_stride,
GPUShape filter_dilation,
GPUShape output_shape);
private:
CUDAEmitter(GPUPrimitiveEmitter* emitter);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
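For context, here is a hypothetical registration of a 2-d forward convolution through the new build_convolution entry point; cuda_emitter, ctx, and every shape value below are illustrative, not taken from the commit.

// Sketch: registering a convolution primitive and getting back its index.
size_t conv_index = cuda_emitter->build_convolution(
    ctx,
    {{"float", "float", "float"}}, // input, filter, output dtypes
    GPUShape{3, 224, 224, 8},      // input shape (C, d1, d2, N -- illustrative order)
    GPUShape{1, 1},                // input padding below
    GPUShape{1, 1},                // input (data) dilation
    GPUShape{64, 3, 3, 3},         // filter shape
    GPUShape{1, 1},                // filter stride
    GPUShape{1, 1},                // filter dilation
    GPUShape{64, 224, 224, 8});    // output shape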
@@ -114,6 +114,16 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
bool include_pad);
static void get_convolution_forward(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 3>& data_types,
int N,
int K,
int filter_size,
int rank,
int sm_tile_size = 8,
int reg_tile_size = 1);
static void add_pod_typedefs(codegen::CodeWriter& writer);
/// \brief Given kernel input variables i_* produce register variables o_coordinates{i}
@@ -127,6 +137,13 @@
std::string i_reduced_strides,
std::string o_coordinates,
size_t rank);
static void coordinate_transform_to_multi_d(codegen::CodeWriter& writer,
std::string i_strides,
std::string i_stride_magic,
std::string i_stride_shift,
std::string i_coord_product,
std::string o_coordinates,
size_t rank);
};
}
}
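The new coordinate_transform_to_multi_d helper emits device code that recovers multi-dimensional coordinates from a flat index. A host-side sketch of the arithmetic it generates (illustrative only; in the emitted CUDA the divisions are replaced by multiply-and-shift sequences built from the idiv_magic_u32 magic/shift pairs, so no hardware division is issued):

#include <cstdint>
#include <vector>

std::vector<uint32_t> to_multi_d(uint32_t flat, const std::vector<uint32_t>& strides)
{
    std::vector<uint32_t> coord(strides.size());
    for (size_t i = 0; i < strides.size(); i++) // strides in row-major (descending) order
    {
        coord[i] = flat / strides[i];  // magic-number division on the device
        flat -= coord[i] * strides[i]; // remainder carries to the next dimension
    }
    return coord;
}

For example, with shape {2, 3, 4} the strides are {12, 4, 1}, so flat index 17 maps to coordinate {1, 1, 1}.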
@@ -42,6 +42,12 @@ namespace ngraph
GPUAllocator(const GPUAllocator& g);
~GPUAllocator();
template <typename T>
size_t reserve_argspace(const T& container)
{
return reserve_argspace(container.data(),
container.size() * sizeof(typename T::value_type));
}
size_t reserve_argspace(const void* data, size_t size);
size_t reserve_workspace(size_t size, bool zero_initialize = true);
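The container overload above is a convenience wrapper over the pointer-and-byte-count form. A hypothetical call site (allocator and values illustrative):

std::vector<int32_t> strides{12, 4, 1};
size_t arg_offset = allocator.reserve_argspace(strides);
// equivalent to:
// allocator.reserve_argspace(strides.data(), strides.size() * sizeof(int32_t));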
@@ -21,6 +21,7 @@
#include <vector>
#include "ngraph/axis_set.hpp"
#include "ngraph/axis_vector.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp"
@@ -30,45 +31,45 @@ namespace ngraph
{
class Shape;
/// \brief Shape for a tensor resident on GPU.
class GPUShape : public std::vector<uint32_t>
class GPUShape : public std::vector<int32_t>
{
public:
GPUShape(const std::initializer_list<uint32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
GPUShape(const std::initializer_list<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const std::vector<uint32_t>& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
GPUShape(const std::vector<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
GPUShape(const GPUShape& axis_lengths)
: std::vector<uint32_t>(axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
explicit GPUShape(size_t n, uint32_t initial_value = 0)
: std::vector<uint32_t>(n, initial_value)
explicit GPUShape(size_t n, int32_t initial_value = 0)
: std::vector<int32_t>(n, initial_value)
{
}
template <class InputIterator>
GPUShape(InputIterator first, InputIterator last)
: std::vector<uint32_t>(first, last)
: std::vector<int32_t>(first, last)
{
}
GPUShape() {}
GPUShape& operator=(const GPUShape& v)
{
static_cast<std::vector<uint32_t>*>(this)->operator=(v);
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
GPUShape& operator=(GPUShape&& v)
{
static_cast<std::vector<uint32_t>*>(this)->operator=(v);
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
@@ -81,7 +82,7 @@
throw std::runtime_error(
"Request exceeds the bitwidth available for GPUShapes (32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
@@ -95,7 +96,7 @@
"Request for Shape which exceeds the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
@@ -109,7 +110,7 @@
"Request for Strides which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
@@ -123,21 +124,36 @@
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
GPUShape(const CoordinateDiff& coord)
{
for (auto const& size : coord)
for (auto const& dim : coord)
{
if (dim != static_cast<int32_t>(dim)) // reject values outside the int32 range, including negatives
{
throw std::runtime_error(
"Request for CoordinateDiff which exceed the bitwidth available for "
"GPUShapes "
"(32)");
}
this->push_back(static_cast<int32_t>(dim));
}
}
GPUShape(const AxisVector& vec)
{
for (auto const& size : vec)
{
if (size >> 31 != 0) // anything above INT32_MAX cannot be stored as int32_t
{
throw std::runtime_error(
"Request for Coordinate which exceed the bitwidth available for GPUShapes "
"Request for axis vector which exceed the bitwidth available for GPUShapes "
"(32)");
}
this->push_back(static_cast<uint32_t>(size));
this->push_back(static_cast<int32_t>(size));
}
}
};
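The switch from uint32_t to int32_t elements is what makes the new CoordinateDiff constructor viable: padding offsets can be negative, and an unsigned element type would silently wrap them. A small illustration (values hypothetical):

CoordinateDiff pad_below{-1, 0};
GPUShape pad(pad_below); // holds {-1, 0}; as uint32_t, -1 would wrap to 4294967295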
@@ -186,3 +186,9 @@ std::pair<uint64_t, uint64_t> runtime::gpu::idiv_magic_u64(uint64_t divisor)
{
return magicU64(divisor);
}
uint32_t runtime::gpu::idiv_ceil(int n, int d)
{
// the compiler fuses the division and modulo into a single operation
return n / d + (n % d > 0);
}
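A typical use of idiv_ceil is launch-dimension math, for example counting how many shared-memory tiles of get_convolution_forward's default width cover an output extent (values illustrative):

uint32_t blocks = runtime::gpu::idiv_ceil(222, 8); // 222 / 8 = 27.75 -> 28 tiles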
@@ -103,6 +103,7 @@ namespace ngraph
void cuda_memset(void* dst, int value, size_t buffer_size);
std::pair<uint64_t, uint64_t> idiv_magic_u32(uint64_t max_numerator, uint64_t divisor);
std::pair<uint64_t, uint64_t> idiv_magic_u64(uint64_t divisor);
uint32_t idiv_ceil(int n, int d);
template <typename T>
void print_gpu_tensor(const void* p, size_t element_count)
@@ -6,17 +6,6 @@ batch_norm_three_outputs
computation_reuse
#int64 is not supported
concat_matrix_int64
#convolution 4d is work in progress
convolution_4d_2items
convolution_4d_4items
convolution_4d_4items_dilated
convolution_4d_4items_padded_neg
convolution_4d_4items_strided
convolution_4d_4items_strided_dilated
convolution_4d_4items_strided_dilated_padded
convolution_4d_4items_strided_dilated_padded_neg
convolution_4d_4items_strided_dilated_padded_same
#cuDNN does not have arithmetic exceptions
divide_by_zero_int32
#int64 is not supported by cuDNN
dot_matrix_vector_int64