Commit b8de3b7d authored by Chris Sullivan, committed by Scott Cyphers

Revert changes to gpu shape and update (#1354)

* GPUShape(int32_t) -> NVShape(uint32_t), NVDiff(int32_t)

* Update code merged from master.

* Add nvshape.hpp and nvdiff.hpp.
parent e5e8d03c
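
For orientation, the net effect of the diff below is that kernel builders now take NVShape (unsigned 32-bit entries, converted from ngraph Shape/Strides/Coordinate values) for shapes and strides, and NVDiff (signed 32-bit entries, converted from CoordinateDiff) for padding. The following is a minimal standalone sketch of that narrowing-and-range-check pattern, using stand-in vector aliases rather than the actual ngraph headers, so it is illustrative only:

// Sketch only: mirrors the 32-bit narrowing and range checks that NVShape and
// NVDiff perform in this commit; the aliases below are stand-ins so the snippet
// compiles on its own (real code should use ngraph::Shape / ngraph::CoordinateDiff).
#include <cstdint>
#include <cstdlib>
#include <stdexcept>
#include <vector>

using Shape = std::vector<size_t>;                  // stand-in for ngraph::Shape
using CoordinateDiff = std::vector<std::ptrdiff_t>; // stand-in for ngraph::CoordinateDiff

// Shape/stride values are non-negative, so they narrow to uint32_t (as NVShape does).
std::vector<uint32_t> to_nvshape(const Shape& s)
{
    std::vector<uint32_t> out;
    for (size_t d : s)
    {
        if (d >> 32 != 0)
        {
            throw std::runtime_error("dimension exceeds the 32 bits available on device");
        }
        out.push_back(static_cast<uint32_t>(d));
    }
    return out;
}

// Padding values can be negative, so they narrow to int32_t (as NVDiff does).
std::vector<int32_t> to_nvdiff(const CoordinateDiff& c)
{
    std::vector<int32_t> out;
    for (std::ptrdiff_t d : c)
    {
        if (std::abs(d) >> 32 != 0)
        {
            throw std::runtime_error("padding value exceeds the 32 bits available on device");
        }
        out.push_back(static_cast<int32_t>(d));
    }
    return out;
}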
@@ -74,9 +74,9 @@ runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitte
}
size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& dtypes,
-std::vector<GPUShape> input_shapes,
+std::vector<NVShape> input_shapes,
size_t concat_axis,
-GPUShape output_shape)
+NVShape output_shape)
{
std::stringstream kernel_name;
size_t input_size = input_shapes.size();
@@ -168,8 +168,8 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& d
}
size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
+NVShape input_shape,
+NVShape output_shape,
size_t one_hot_axis)
{
std::stringstream kernel_name;
@@ -238,7 +238,7 @@ size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>&
}
size_t runtime::gpu::CUDAEmitter::build_reverse(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
+NVShape input_shape,
std::vector<uint32_t> reverse_axes)
{
uint32_t rank = static_cast<uint32_t>(input_shape.size());
@@ -311,11 +311,11 @@ size_t runtime::gpu::CUDAEmitter::build_reverse(const std::array<std::string, 2>
}
size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape padding_below,
-GPUShape padding_above,
-GPUShape padding_interior,
+NVShape input_shape,
+NVShape output_shape,
+NVShape padding_below,
+NVShape padding_above,
+NVShape padding_interior,
const std::string& pad_value)
{
// Need to check: are there models in which some tensors will have different types? if so, this
@@ -350,9 +350,9 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
if (compiled_kernel == nullptr)
{
// normalize pad dimensions to shape dimensions
-GPUShape pad_below(input_shape.size(), 0);
-GPUShape pad_above(input_shape.size(), 0);
-GPUShape pad_interior(input_shape.size(), 0);
+NVShape pad_below(input_shape.size(), 0);
+NVShape pad_above(input_shape.size(), 0);
+NVShape pad_interior(input_shape.size(), 0);
// if padding_interior is not zero length, it
// is from op::Pad for which padding_below will
@@ -372,8 +372,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
pad_interior = padding_interior;
}
-GPUShape input_strides = row_major_strides(input_shape);
-GPUShape output_strides = row_major_strides(output_shape);
+NVShape input_strides = row_major_strides(input_shape);
+NVShape output_strides = row_major_strides(output_shape);
int offset = 0;
for (size_t i = 0; i < output_strides.size(); i++)
@@ -477,10 +477,10 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
}
size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape padding_below,
-GPUShape padding_interior)
+NVShape input_shape,
+NVShape output_shape,
+NVShape padding_below,
+NVShape padding_interior)
{
std::stringstream kernel_name;
kernel_name << "pad_dynamic_" << join(dtypes, "_");
@@ -514,8 +514,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
uint32_t rank = static_cast<uint32_t>(input_shape.size());
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
-GPUShape pad_below(input_shape.size(), 0);
-GPUShape pad_interior(input_shape.size(), 1);
+NVShape pad_below(input_shape.size(), 0);
+NVShape pad_interior(input_shape.size(), 1);
int64_t i = padding_below.size() - 1;
int64_t j = input_shape.size() - 1;
@@ -525,8 +525,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
pad_interior[j] = padding_interior[i];
}
-GPUShape input_strides = row_major_strides(input_shape);
-GPUShape output_strides = row_major_strides(output_shape);
+NVShape input_strides = row_major_strides(input_shape);
+NVShape output_strides = row_major_strides(output_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -576,8 +576,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape input_order)
+NVShape input_shape,
+NVShape input_order)
{
auto rank = input_shape.size();
std::stringstream kernel_name;
@@ -613,9 +613,9 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>
//TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
-GPUShape input_strides = row_major_strides(input_shape);
-GPUShape output_strides(rank);
-GPUShape trans_strides(rank);
+NVShape input_strides = row_major_strides(input_shape);
+NVShape output_strides(rank);
+NVShape trans_strides(rank);
int stride = 1;
for (int64_t i = rank - 1; i >= 0; i--)
{
@@ -662,10 +662,10 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>
}
size_t runtime::gpu::CUDAEmitter::build_slice(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape lower_bounds,
-GPUShape slice_strides,
-GPUShape output_shape)
+NVShape input_shape,
+NVShape lower_bounds,
+NVShape slice_strides,
+NVShape output_shape)
{
std::stringstream kernel_name;
kernel_name << "slice_" << join(dtypes, "_") << "_r_" << output_shape.size();
@@ -701,8 +701,8 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const std::array<std::string, 2>&
//TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
-GPUShape output_strides = row_major_strides(output_shape);
-GPUShape input_strides = row_major_strides(input_shape);
+NVShape output_strides = row_major_strides(output_shape);
+NVShape input_strides = row_major_strides(input_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -751,9 +751,9 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const std::array<std::string, 2>&
}
size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const std::array<std::string, 3>& dtypes,
-GPUShape input_shape0,
-GPUShape input_shape1,
-GPUShape output_shape,
+NVShape input_shape0,
+NVShape input_shape1,
+NVShape output_shape,
size_t batch_axis,
size_t sequence_axis)
{
@@ -792,7 +792,7 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const std::array<std::s
//TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
-GPUShape output_strides = row_major_strides(output_shape);
+NVShape output_strides = row_major_strides(output_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -834,8 +834,8 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const std::array<std::s
}
size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
+NVShape input_shape,
+NVShape output_shape,
size_t window_width,
size_t window_stride)
{
@@ -894,7 +894,7 @@ size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const std::array<std::string
}
pooling_op_shape
-avgpool_shape(GPUShape in, GPUShape out, GPUShape window, GPUShape strides, GPUShape pad)
+avgpool_shape(NVShape in, NVShape out, NVShape window, NVShape strides, NVShape pad)
{
pooling_op_shape shape;
shape.N = in[0];
@@ -963,11 +963,11 @@ pooling_op_shape
}
size_t runtime::gpu::CUDAEmitter::build_avg_pool(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape window_shape,
-GPUShape window_stride,
-GPUShape padding_below,
+NVShape input_shape,
+NVShape output_shape,
+NVShape window_shape,
+NVShape window_stride,
+NVShape padding_below,
bool include_pad)
{
// assumes NCDHW format
@@ -1082,7 +1082,7 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const std::array<std::string, 2
}
size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const char* op,
const char* kernel)
{
@@ -1256,8 +1256,8 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
}
size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::string>& dtypes,
-GPUShape input_shape,
-GPUShape reduce_shape,
+NVShape input_shape,
+NVShape reduce_shape,
std::vector<size_t> axes_flag)
{
std::string kernel_name =
@@ -1287,8 +1287,8 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
-GPUShape input_strides = row_major_strides(input_shape);
-GPUShape reduce_strides = row_major_strides(reduce_shape);
+NVShape input_strides = row_major_strides(input_shape);
+NVShape reduce_strides = row_major_strides(reduce_shape);
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -1408,7 +1408,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
size_t
runtime::gpu::CUDAEmitter::build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const std::set<size_t>& reduced_tensors,
const std::set<size_t>& axes,
const char* op,
@@ -1462,7 +1462,7 @@ size_t
}
// calculate strides
-GPUShape strides = row_major_strides(tensor_shape);
+NVShape strides = row_major_strides(tensor_shape);
// precacluate invariants for integer division via multiplication
std::vector<int> stride_magic;
std::vector<int> stride_shift;
@@ -1475,12 +1475,12 @@ size_t
stride_shift.push_back(shift);
}
// calculate reduced tensor strides with 0s inserted for reduced axes
-GPUShape reduced_shape = tensor_shape;
+NVShape reduced_shape = tensor_shape;
for (auto const& axis : axes)
{
reduced_shape[axis] = 1;
}
-GPUShape reduced_strides = row_major_strides(reduced_shape);
+NVShape reduced_strides = row_major_strides(reduced_shape);
for (auto const& axis : axes)
{
reduced_strides[axis] = 0;
@@ -1544,10 +1544,10 @@ size_t
size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
const std::vector<std::string>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape reduce_window_shape,
-GPUShape reduce_window_strides)
+NVShape input_shape,
+NVShape output_shape,
+NVShape reduce_window_shape,
+NVShape reduce_window_strides)
{
const char* op = NULL;
const char* kernel = NULL;
@@ -1605,7 +1605,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
}
size_t nthreads = shape_size(output_shape);
-GPUShape input_strides = row_major_strides(input_shape);
+NVShape input_strides = row_major_strides(input_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -1658,11 +1658,11 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
}
size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::string, 3>& dtypes,
-GPUShape tensor_shape,
-GPUShape source_shape,
-GPUShape lower_bounds,
-GPUShape upper_bounds,
-GPUShape slice_strides)
+NVShape tensor_shape,
+NVShape source_shape,
+NVShape lower_bounds,
+NVShape upper_bounds,
+NVShape slice_strides)
{
// assumes NC{d1,...,dn} format
std::string kernel_name = "repslices_" + join(dtypes, "_");
@@ -1695,8 +1695,8 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::stri
}
// calculate strides
-GPUShape input_strides = row_major_strides(tensor_shape);
-GPUShape source_strides = row_major_strides(source_shape);
+NVShape input_strides = row_major_strides(tensor_shape);
+NVShape source_strides = row_major_strides(source_shape);
// precacluate invariants for integer division via multiplication
std::vector<int> dmagics;
std::vector<int> dshifts;
@@ -1796,7 +1796,7 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::stri
}
size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string, 2>& dtypes,
-GPUShape result_shape,
+NVShape result_shape,
const std::set<size_t>& reduce_axes)
{
// assumes NC{d1,...,dn} format
@@ -1816,7 +1816,7 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string,
}
// calculate strides
-GPUShape strides = row_major_strides(result_shape);
+NVShape strides = row_major_strides(result_shape);
// precacluate invariants for integer division via multiplication
std::vector<int> stride_magic;
std::vector<int> stride_shift;
@@ -1829,12 +1829,12 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string,
stride_shift.push_back(shift);
}
// calculate reduced tensor strides with 0s inserted for reduced axes
-GPUShape reduced_shape = result_shape;
+NVShape reduced_shape = result_shape;
for (auto const& axis : reduce_axes)
{
reduced_shape[axis] = 1;
}
-GPUShape reduced_strides = row_major_strides(reduced_shape);
+NVShape reduced_strides = row_major_strides(reduced_shape);
for (auto const& axis : reduce_axes)
{
reduced_strides[axis] = 0;
@@ -1940,7 +1940,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
size_t transposed_output_idx =
allocator.reserve_workspace(shape_size(output_shape) * out[0].get_element_type().size());
-GPUShape input_order;
+NVShape input_order;
for (int i = 1; i <= tensor_size; i++)
{
input_order.push_back(i % tensor_size);
@@ -1957,7 +1957,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
input_order);
// local helper to reshape tensor shape objects
-auto reshape = [](const Shape& shape, const GPUShape& order) {
+auto reshape = [](const Shape& shape, const NVShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
@@ -1977,12 +1977,12 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
args[1].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}},
input_shape,
-node->get_padding_below(),
-node->get_data_dilation_strides(),
filter_shape,
+output_shape,
node->get_window_movement_strides(),
node->get_window_dilation_strides(),
-output_shape);
+node->get_data_dilation_strides(),
+node->get_padding_below());
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
@@ -2026,13 +2026,13 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
}
size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string, 3>& dtypes,
-GPUShape input_shape,
-GPUShape input_pad_below,
-GPUShape input_dilation,
-GPUShape filter_shape,
-GPUShape filter_stride,
-GPUShape filter_dilation,
-GPUShape output_shape)
+NVShape input_shape,
+NVShape filter_shape,
+NVShape output_shape,
+NVShape filter_stride,
+NVShape filter_dilation,
+NVShape input_dilation,
+NVDiff input_pad_below)
{
// convolution is performed on tensors in the following format
// input_shape: C{di_1,...,du_n}N
@@ -2155,7 +2155,7 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
data_dilation_magic[i] = magic;
data_dilation_shift[i] = shift;
}
-GPUShape input_shape_str = row_major_strides(input_shape);
+NVShape input_shape_str = row_major_strides(input_shape);
float alpha = 1.0f;
float beta = 0.0f;
@@ -2267,7 +2267,7 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
void runtime::gpu::CUDAEmitter::print_tensor_from_gpu(codegen::CodeWriter& writer,
const std::string& tensor_name,
-GPUShape shape)
+NVShape shape)
{
auto strides = row_major_strides(shape);
writer << "__syncthreads();\n";
...
@@ -19,7 +19,8 @@
#include <array>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
-#include "ngraph/runtime/gpu/gpu_shape.hpp"
+#include "ngraph/runtime/gpu/nvdiff.hpp"
+#include "ngraph/runtime/gpu/nvshape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/op/convolution.hpp"
@@ -28,7 +29,7 @@
namespace ngraph
{
-class GPUShape;
+class NVShape;
namespace runtime
{
@@ -48,65 +49,65 @@ namespace ngraph
public:
size_t build_pad(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape pad_below,
-GPUShape pad_above,
-GPUShape pad_interior,
+NVShape input_shape,
+NVShape output_shape,
+NVShape pad_below,
+NVShape pad_above,
+NVShape pad_interior,
const std::string& pad_value = "");
size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape padding_below,
-GPUShape padding_interior);
+NVShape input_shape,
+NVShape output_shape,
+NVShape padding_below,
+NVShape padding_interior);
size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
+NVShape input_shape,
+NVShape output_shape,
size_t window_width,
size_t window_stride);
size_t build_avg_pool(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape window_shape,
-GPUShape window_stride,
-GPUShape padding_below,
+NVShape input_shape,
+NVShape output_shape,
+NVShape window_shape,
+NVShape window_stride,
+NVShape padding_below,
bool include_pad = false);
size_t build_slice(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape lower_bounds,
-GPUShape slice_strides,
-GPUShape output_shape);
+NVShape input_shape,
+NVShape lower_bounds,
+NVShape slice_strides,
+NVShape output_shape);
size_t build_reduce_window(const OpName op_name,
const std::vector<std::string>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape reduce_window_shape,
-GPUShape reduce_window_strides);
+NVShape input_shape,
+NVShape output_shape,
+NVShape reduce_window_shape,
+NVShape reduce_window_strides);
size_t build_reverse_sequence(const std::array<std::string, 3>& dtypes,
-GPUShape input_shape0,
-GPUShape input_shape1,
-GPUShape output_shape,
+NVShape input_shape0,
+NVShape input_shape1,
+NVShape output_shape,
size_t batch_axis,
size_t sequence_axis);
size_t build_onehot(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
+NVShape input_shape,
+NVShape output_shape,
size_t one_hot_axis);
size_t build_reverse(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
+NVShape input_shape,
std::vector<uint32_t> reverse_axes);
template <typename T>
size_t build_elementwise(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape)
+NVShape tensor_shape)
{
return build_elementwise_n_to_1(
dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
@@ -114,7 +115,7 @@ namespace ngraph
template <typename ELEMENTWISE_OP_TYPE, typename REDUCE_OP_TYPE = ngraph::op::Nop>
size_t build_elementwise_collective(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const std::set<size_t>& reduced_tensors = {},
const std::set<size_t>& axes = {},
bool save_elementwise = false)
@@ -130,37 +131,37 @@ namespace ngraph
}
size_t build_replace_slice(const std::array<std::string, 3>& dtypes,
-GPUShape tensor_shape,
-GPUShape source_shape,
-GPUShape lower_bounds,
-GPUShape upper_bounds,
-GPUShape slice_stride);
+NVShape tensor_shape,
+NVShape source_shape,
+NVShape lower_bounds,
+NVShape upper_bounds,
+NVShape slice_stride);
size_t build_broadcast(const std::array<std::string, 2>& dtypes,
-GPUShape result_shape,
+NVShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape input_order);
+NVShape input_shape,
+NVShape input_order);
size_t build_convolution(const std::array<std::string, 3>& dtypes,
-GPUShape input_shape,
-GPUShape input_pad_below,
-GPUShape input_dilation,
-GPUShape filter_shape,
-GPUShape filter_stride,
-GPUShape filter_dilation,
-GPUShape output_shape);
+NVShape input_shape,
+NVShape filter_shape,
+NVShape output_shape,
+NVShape filter_stride,
+NVShape filter_dilation,
+NVShape input_dilation,
+NVDiff input_pad_below);
size_t build_concat(const std::vector<std::string>& dtypes,
-std::vector<GPUShape> input_shapes,
+std::vector<NVShape> input_shapes,
size_t concat_axis,
-GPUShape output_shape);
+NVShape output_shape);
size_t build_softmax_divide(const std::vector<std::string>& dtypes,
-GPUShape input_shape,
-GPUShape reduce_shape,
+NVShape input_shape,
+NVShape reduce_shape,
std::vector<size_t> axes_flag);
void debug_sync();
@@ -171,14 +172,14 @@ namespace ngraph
uint32_t align_to_block_size(uint32_t threads, uint32_t block_size);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
const std::string& tensor_name,
-GPUShape shape);
+NVShape shape);
std::string include_helpers();
size_t build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const char* op,
const char* kernel);
size_t build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const std::set<size_t>& reduced_tensors,
const std::set<size_t>& axes,
const char* op,
...
@@ -475,7 +475,7 @@ namespace ngraph
auto axis = concat->get_concatenation_axis();
std::vector<std::string> dtypes;
-std::vector<GPUShape> input_shapes;
+std::vector<NVShape> input_shapes;
for (auto arg : args)
{
dtypes.push_back(arg.get_type());
...
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cstdio>
#include <stdexcept>
#include <vector>
#include "ngraph/coordinate_diff.hpp"
namespace ngraph
{
class Shape;
/// \brief Shape for a tensor resident on GPU.
class NVDiff : public std::vector<int32_t>
{
public:
NVDiff(const std::initializer_list<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
NVDiff(const std::vector<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
NVDiff(const NVDiff& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
explicit NVDiff(size_t n, int32_t initial_value = 0)
: std::vector<int32_t>(n, initial_value)
{
}
template <class InputIterator>
NVDiff(InputIterator first, InputIterator last)
: std::vector<int32_t>(first, last)
{
}
NVDiff() {}
NVDiff& operator=(const NVDiff& v)
{
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
NVDiff& operator=(NVDiff&& v)
{
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
NVDiff(const CoordinateDiff& coord)
{
for (auto const& dim : coord)
{
if (std::abs(dim) >> 32 != 0)
{
throw std::runtime_error(
"Request for CoordinateDiff which exceed the bitwidth available for "
"NVDiffs (32)");
}
this->push_back(static_cast<int32_t>(dim));
}
}
};
}
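
As a usage note (not part of the diff): because the CoordinateDiff constructor above is not declared explicit, call sites can pass a CoordinateDiff such as node->get_padding_below() wherever an NVDiff parameter is expected, which is what the build_convolution signature change above relies on. A small hypothetical example of that conversion:

// Hypothetical example of the implicit CoordinateDiff -> NVDiff conversion
// defined above; the values are illustrative only.
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/runtime/gpu/nvdiff.hpp"

void pad_example()
{
    ngraph::CoordinateDiff padding_below{-1, 0, 2};
    ngraph::NVDiff pad = padding_below; // element-wise narrowing to int32_t with a 32-bit range check
    (void)pad; // values whose magnitude does not fit in 32 bits would throw std::runtime_error instead
}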
@@ -31,129 +31,114 @@ namespace ngraph
{
class Shape;
/// \brief Shape for a tensor resident on GPU.
-class GPUShape : public std::vector<int32_t>
+class NVShape : public std::vector<uint32_t>
{
public:
-GPUShape(const std::initializer_list<int32_t>& axis_lengths)
-: std::vector<int32_t>(axis_lengths)
+NVShape(const std::initializer_list<uint32_t>& axis_lengths)
+: std::vector<uint32_t>(axis_lengths)
{
}
-GPUShape(const std::vector<int32_t>& axis_lengths)
-: std::vector<int32_t>(axis_lengths)
+NVShape(const std::vector<uint32_t>& axis_lengths)
+: std::vector<uint32_t>(axis_lengths)
{
}
-GPUShape(const GPUShape& axis_lengths)
-: std::vector<int32_t>(axis_lengths)
+NVShape(const NVShape& axis_lengths)
+: std::vector<uint32_t>(axis_lengths)
{
}
-explicit GPUShape(size_t n, int32_t initial_value = 0)
-: std::vector<int32_t>(n, initial_value)
+explicit NVShape(size_t n, uint32_t initial_value = 0)
+: std::vector<uint32_t>(n, initial_value)
{
}
template <class InputIterator>
-GPUShape(InputIterator first, InputIterator last)
-: std::vector<int32_t>(first, last)
+NVShape(InputIterator first, InputIterator last)
+: std::vector<uint32_t>(first, last)
{
}
-GPUShape() {}
-GPUShape& operator=(const GPUShape& v)
+NVShape() {}
+NVShape& operator=(const NVShape& v)
{
-static_cast<std::vector<int32_t>*>(this)->operator=(v);
+static_cast<std::vector<uint32_t>*>(this)->operator=(v);
return *this;
}
-GPUShape& operator=(GPUShape&& v)
+NVShape& operator=(NVShape&& v)
{
-static_cast<std::vector<int32_t>*>(this)->operator=(v);
+static_cast<std::vector<uint32_t>*>(this)->operator=(v);
return *this;
}
-GPUShape(const std::vector<size_t>& vec)
+NVShape(const std::vector<size_t>& vec)
{
for (size_t const& size : vec)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request exceeds the bitwidth available for GPUShapes (32)");
+"Request exceeds the bitwidth available for NVShapes (32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
-GPUShape(const Shape& shape)
+NVShape(const Shape& shape)
{
for (size_t const& size : shape)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request for Shape which exceeds the bitwidth available for GPUShapes "
+"Request for Shape which exceeds the bitwidth available for NVShapes "
"(32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
-GPUShape(const Strides& strides)
+NVShape(const Strides& strides)
{
for (size_t const& size : strides)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request for Strides which exceed the bitwidth available for GPUShapes "
+"Request for Strides which exceed the bitwidth available for NVShapes "
"(32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
-GPUShape(const Coordinate& coord)
+NVShape(const Coordinate& coord)
{
for (size_t const& size : coord)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request for Coordinate which exceed the bitwidth available for GPUShapes "
+"Request for Coordinate which exceed the bitwidth available for NVShapes "
"(32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
-GPUShape(const CoordinateDiff& coord)
-{
-for (auto const& dim : coord)
-{
-if (dim > 0 && dim >> 32 != 0)
-{
-throw std::runtime_error(
-"Request for CoordinateDiff which exceed the bitwidth available for "
-"GPUShapes "
-"(32)");
-}
-this->push_back(static_cast<int32_t>(dim));
-}
-}
-GPUShape(const AxisVector& vec)
+NVShape(const AxisVector& vec)
{
for (auto const& size : vec)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request for axis vector which exceed the bitwidth available for GPUShapes "
+"Request for axis vector which exceed the bitwidth available for NVShapes "
"(32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
};
...
@@ -20,15 +20,15 @@
#include "gtest/gtest.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
-#include "ngraph/runtime/gpu/gpu_shape.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
+#include "ngraph/runtime/gpu/nvshape.hpp"
using namespace ngraph;
TEST(gpu_test, gpu_shape_from_64bit_shape)
{
Shape shape{1UL << 33};
-ASSERT_ANY_THROW([](GPUShape s) {}(shape););
+ASSERT_ANY_THROW([](NVShape s) {}(shape););
}
TEST(gpu_test, memory_manager_unallocated)
...