Commit b8de3b7d authored by Chris Sullivan, committed by Scott Cyphers

Revert changes to gpu shape and update (#1354)

* GPUShape(int32_t) -> NVShape(uint32_t), NVDiff(int32_t)

* Update code merged from master.

* Add nvshape.hpp and nvdiff.hpp.
parent e5e8d03c
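
For orientation, the net effect of the diff below is that kernel builders now take NVShape (unsigned 32-bit entries, converted from ngraph Shape/Strides/Coordinate values) for shapes and strides, and NVDiff (signed 32-bit entries, converted from CoordinateDiff) for padding. The following is a minimal standalone sketch of that narrowing-and-range-check pattern, using stand-in vector aliases rather than the actual ngraph headers, so it is illustrative only:

// Sketch only: mirrors the 32-bit narrowing and range checks that NVShape and
// NVDiff perform in this commit; the aliases below are stand-ins so the snippet
// compiles on its own (real code should use ngraph::Shape / ngraph::CoordinateDiff).
#include <cstdint>
#include <cstdlib>
#include <stdexcept>
#include <vector>

using Shape = std::vector<size_t>;                  // stand-in for ngraph::Shape
using CoordinateDiff = std::vector<std::ptrdiff_t>; // stand-in for ngraph::CoordinateDiff

// Shape/stride values are non-negative, so they narrow to uint32_t (as NVShape does).
std::vector<uint32_t> to_nvshape(const Shape& s)
{
    std::vector<uint32_t> out;
    for (size_t d : s)
    {
        if (d >> 32 != 0)
        {
            throw std::runtime_error("dimension exceeds the 32 bits available on device");
        }
        out.push_back(static_cast<uint32_t>(d));
    }
    return out;
}

// Padding values can be negative, so they narrow to int32_t (as NVDiff does).
std::vector<int32_t> to_nvdiff(const CoordinateDiff& c)
{
    std::vector<int32_t> out;
    for (std::ptrdiff_t d : c)
    {
        if (std::abs(d) >> 32 != 0)
        {
            throw std::runtime_error("padding value exceeds the 32 bits available on device");
        }
        out.push_back(static_cast<int32_t>(d));
    }
    return out;
}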
@@ -74,9 +74,9 @@ runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitte
}
size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& dtypes,
-std::vector<GPUShape> input_shapes,
+std::vector<NVShape> input_shapes,
size_t concat_axis,
-GPUShape output_shape)
+NVShape output_shape)
{
std::stringstream kernel_name;
size_t input_size = input_shapes.size();
@@ -168,8 +168,8 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& d
}
size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
+NVShape input_shape,
+NVShape output_shape,
size_t one_hot_axis)
{
std::stringstream kernel_name;
@@ -238,7 +238,7 @@ size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>&
}
size_t runtime::gpu::CUDAEmitter::build_reverse(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
+NVShape input_shape,
std::vector<uint32_t> reverse_axes)
{
uint32_t rank = static_cast<uint32_t>(input_shape.size());
@@ -311,11 +311,11 @@ size_t runtime::gpu::CUDAEmitter::build_reverse(const std::array<std::string, 2>
}
size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape padding_below,
-GPUShape padding_above,
-GPUShape padding_interior,
+NVShape input_shape,
+NVShape output_shape,
+NVShape padding_below,
+NVShape padding_above,
+NVShape padding_interior,
const std::string& pad_value)
{
// Need to check: are there models in which some tensors will have different types? if so, this
@@ -350,9 +350,9 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
if (compiled_kernel == nullptr)
{
// normalize pad dimensions to shape dimensions
-GPUShape pad_below(input_shape.size(), 0);
-GPUShape pad_above(input_shape.size(), 0);
-GPUShape pad_interior(input_shape.size(), 0);
+NVShape pad_below(input_shape.size(), 0);
+NVShape pad_above(input_shape.size(), 0);
+NVShape pad_interior(input_shape.size(), 0);
// if padding_interior is not zero length, it
// is from op::Pad for which padding_below will
@@ -372,8 +372,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
pad_interior = padding_interior;
}
-GPUShape input_strides = row_major_strides(input_shape);
-GPUShape output_strides = row_major_strides(output_shape);
+NVShape input_strides = row_major_strides(input_shape);
+NVShape output_strides = row_major_strides(output_shape);
int offset = 0;
for (size_t i = 0; i < output_strides.size(); i++)
@@ -477,10 +477,10 @@ size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dt
}
size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape padding_below,
-GPUShape padding_interior)
+NVShape input_shape,
+NVShape output_shape,
+NVShape padding_below,
+NVShape padding_interior)
{
std::stringstream kernel_name;
kernel_name << "pad_dynamic_" << join(dtypes, "_");
@@ -514,8 +514,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
uint32_t rank = static_cast<uint32_t>(input_shape.size());
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
-GPUShape pad_below(input_shape.size(), 0);
-GPUShape pad_interior(input_shape.size(), 1);
+NVShape pad_below(input_shape.size(), 0);
+NVShape pad_interior(input_shape.size(), 1);
int64_t i = padding_below.size() - 1;
int64_t j = input_shape.size() - 1;
@@ -525,8 +525,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
pad_interior[j] = padding_interior[i];
}
-GPUShape input_strides = row_major_strides(input_shape);
-GPUShape output_strides = row_major_strides(output_shape);
+NVShape input_strides = row_major_strides(input_shape);
+NVShape output_strides = row_major_strides(output_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -576,8 +576,8 @@ size_t runtime::gpu::CUDAEmitter::build_pad_dynamic(const std::array<std::string
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape input_order)
+NVShape input_shape,
+NVShape input_order)
{
auto rank = input_shape.size();
std::stringstream kernel_name;
@@ -613,9 +613,9 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>
//TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
-GPUShape input_strides = row_major_strides(input_shape);
-GPUShape output_strides(rank);
-GPUShape trans_strides(rank);
+NVShape input_strides = row_major_strides(input_shape);
+NVShape output_strides(rank);
+NVShape trans_strides(rank);
int stride = 1;
for (int64_t i = rank - 1; i >= 0; i--)
{
@@ -662,10 +662,10 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>
}
size_t runtime::gpu::CUDAEmitter::build_slice(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape lower_bounds,
-GPUShape slice_strides,
-GPUShape output_shape)
+NVShape input_shape,
+NVShape lower_bounds,
+NVShape slice_strides,
+NVShape output_shape)
{
std::stringstream kernel_name;
kernel_name << "slice_" << join(dtypes, "_") << "_r_" << output_shape.size();
@@ -701,8 +701,8 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const std::array<std::string, 2>&
//TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
-GPUShape output_strides = row_major_strides(output_shape);
-GPUShape input_strides = row_major_strides(input_shape);
+NVShape output_strides = row_major_strides(output_shape);
+NVShape input_strides = row_major_strides(input_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -751,9 +751,9 @@ size_t runtime::gpu::CUDAEmitter::build_slice(const std::array<std::string, 2>&
}
size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const std::array<std::string, 3>& dtypes,
-GPUShape input_shape0,
-GPUShape input_shape1,
-GPUShape output_shape,
+NVShape input_shape0,
+NVShape input_shape1,
+NVShape output_shape,
size_t batch_axis,
size_t sequence_axis)
{
@@ -792,7 +792,7 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const std::array<std::s
//TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
-GPUShape output_strides = row_major_strides(output_shape);
+NVShape output_strides = row_major_strides(output_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -834,8 +834,8 @@ size_t runtime::gpu::CUDAEmitter::build_reverse_sequence(const std::array<std::s
}
size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
+NVShape input_shape,
+NVShape output_shape,
size_t window_width,
size_t window_stride)
{
@@ -894,7 +894,7 @@ size_t runtime::gpu::CUDAEmitter::build_1d_max_pool(const std::array<std::string
}
pooling_op_shape
-avgpool_shape(GPUShape in, GPUShape out, GPUShape window, GPUShape strides, GPUShape pad)
+avgpool_shape(NVShape in, NVShape out, NVShape window, NVShape strides, NVShape pad)
{
pooling_op_shape shape;
shape.N = in[0];
@@ -963,11 +963,11 @@ pooling_op_shape
}
size_t runtime::gpu::CUDAEmitter::build_avg_pool(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape window_shape,
-GPUShape window_stride,
-GPUShape padding_below,
+NVShape input_shape,
+NVShape output_shape,
+NVShape window_shape,
+NVShape window_stride,
+NVShape padding_below,
bool include_pad)
{
// assumes NCDHW format
@@ -1082,7 +1082,7 @@ size_t runtime::gpu::CUDAEmitter::build_avg_pool(const std::array<std::string, 2
}
size_t runtime::gpu::CUDAEmitter::build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const char* op,
const char* kernel)
{
@@ -1256,8 +1256,8 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::MaxPool* node)
}
size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::string>& dtypes,
-GPUShape input_shape,
-GPUShape reduce_shape,
+NVShape input_shape,
+NVShape reduce_shape,
std::vector<size_t> axes_flag)
{
std::string kernel_name =
@@ -1287,8 +1287,8 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
-GPUShape input_strides = row_major_strides(input_shape);
-GPUShape reduce_strides = row_major_strides(reduce_shape);
+NVShape input_strides = row_major_strides(input_shape);
+NVShape reduce_strides = row_major_strides(reduce_shape);
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -1408,7 +1408,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
size_t
runtime::gpu::CUDAEmitter::build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const std::set<size_t>& reduced_tensors,
const std::set<size_t>& axes,
const char* op,
@@ -1462,7 +1462,7 @@ size_t
}
// calculate strides
-GPUShape strides = row_major_strides(tensor_shape);
+NVShape strides = row_major_strides(tensor_shape);
// precacluate invariants for integer division via multiplication
std::vector<int> stride_magic;
std::vector<int> stride_shift;
@@ -1475,12 +1475,12 @@ size_t
stride_shift.push_back(shift);
}
// calculate reduced tensor strides with 0s inserted for reduced axes
-GPUShape reduced_shape = tensor_shape;
+NVShape reduced_shape = tensor_shape;
for (auto const& axis : axes)
{
reduced_shape[axis] = 1;
}
-GPUShape reduced_strides = row_major_strides(reduced_shape);
+NVShape reduced_strides = row_major_strides(reduced_shape);
for (auto const& axis : axes)
{
reduced_strides[axis] = 0;
@@ -1544,10 +1544,10 @@ size_t
size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
const std::vector<std::string>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape reduce_window_shape,
-GPUShape reduce_window_strides)
+NVShape input_shape,
+NVShape output_shape,
+NVShape reduce_window_shape,
+NVShape reduce_window_strides)
{
const char* op = NULL;
const char* kernel = NULL;
@@ -1605,7 +1605,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
}
size_t nthreads = shape_size(output_shape);
-GPUShape input_strides = row_major_strides(input_shape);
+NVShape input_strides = row_major_strides(input_shape);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
@@ -1658,11 +1658,11 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
}
size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::string, 3>& dtypes,
-GPUShape tensor_shape,
-GPUShape source_shape,
-GPUShape lower_bounds,
-GPUShape upper_bounds,
-GPUShape slice_strides)
+NVShape tensor_shape,
+NVShape source_shape,
+NVShape lower_bounds,
+NVShape upper_bounds,
+NVShape slice_strides)
{
// assumes NC{d1,...,dn} format
std::string kernel_name = "repslices_" + join(dtypes, "_");
@@ -1695,8 +1695,8 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::stri
}
// calculate strides
-GPUShape input_strides = row_major_strides(tensor_shape);
-GPUShape source_strides = row_major_strides(source_shape);
+NVShape input_strides = row_major_strides(tensor_shape);
+NVShape source_strides = row_major_strides(source_shape);
// precacluate invariants for integer division via multiplication
std::vector<int> dmagics;
std::vector<int> dshifts;
@@ -1796,7 +1796,7 @@ size_t runtime::gpu::CUDAEmitter::build_replace_slice(const std::array<std::stri
}
size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string, 2>& dtypes,
-GPUShape result_shape,
+NVShape result_shape,
const std::set<size_t>& reduce_axes)
{
// assumes NC{d1,...,dn} format
@@ -1816,7 +1816,7 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string,
}
// calculate strides
-GPUShape strides = row_major_strides(result_shape);
+NVShape strides = row_major_strides(result_shape);
// precacluate invariants for integer division via multiplication
std::vector<int> stride_magic;
std::vector<int> stride_shift;
@@ -1829,12 +1829,12 @@ size_t runtime::gpu::CUDAEmitter::build_broadcast(const std::array<std::string,
stride_shift.push_back(shift);
}
// calculate reduced tensor strides with 0s inserted for reduced axes
-GPUShape reduced_shape = result_shape;
+NVShape reduced_shape = result_shape;
for (auto const& axis : reduce_axes)
{
reduced_shape[axis] = 1;
}
-GPUShape reduced_strides = row_major_strides(reduced_shape);
+NVShape reduced_strides = row_major_strides(reduced_shape);
for (auto const& axis : reduce_axes)
{
reduced_strides[axis] = 0;
@@ -1940,7 +1940,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
size_t transposed_output_idx =
allocator.reserve_workspace(shape_size(output_shape) * out[0].get_element_type().size());
-GPUShape input_order;
+NVShape input_order;
for (int i = 1; i <= tensor_size; i++)
{
input_order.push_back(i % tensor_size);
@@ -1957,7 +1957,7 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
input_order);
// local helper to reshape tensor shape objects
-auto reshape = [](const Shape& shape, const GPUShape& order) {
+auto reshape = [](const Shape& shape, const NVShape& order) {
Shape output(shape.size(), 0);
for (size_t i = 0; i < shape.size(); i++)
{
@@ -1977,12 +1977,12 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
args[1].get_element_type().c_type_string(),
out[0].get_element_type().c_type_string()}},
input_shape,
-node->get_padding_below(),
-node->get_data_dilation_strides(),
filter_shape,
+output_shape,
node->get_window_movement_strides(),
node->get_window_dilation_strides(),
-output_shape);
+node->get_data_dilation_strides(),
+node->get_padding_below());
// reshape output tensor (K{do_1,...,do_n}N -> NK{do_1,...,do_n})
input_order.clear();
@@ -2026,13 +2026,13 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Convolution* node)
}
size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string, 3>& dtypes,
-GPUShape input_shape,
-GPUShape input_pad_below,
-GPUShape input_dilation,
-GPUShape filter_shape,
-GPUShape filter_stride,
-GPUShape filter_dilation,
-GPUShape output_shape)
+NVShape input_shape,
+NVShape filter_shape,
+NVShape output_shape,
+NVShape filter_stride,
+NVShape filter_dilation,
+NVShape input_dilation,
+NVDiff input_pad_below)
{
// convolution is performed on tensors in the following format
// input_shape: C{di_1,...,du_n}N
@@ -2155,7 +2155,7 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
data_dilation_magic[i] = magic;
data_dilation_shift[i] = shift;
}
-GPUShape input_shape_str = row_major_strides(input_shape);
+NVShape input_shape_str = row_major_strides(input_shape);
float alpha = 1.0f;
float beta = 0.0f;
@@ -2267,7 +2267,7 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
void runtime::gpu::CUDAEmitter::print_tensor_from_gpu(codegen::CodeWriter& writer,
const std::string& tensor_name,
-GPUShape shape)
+NVShape shape)
{
auto strides = row_major_strides(shape);
writer << "__syncthreads();\n";
...
@@ -19,7 +19,8 @@
#include <array>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
-#include "ngraph/runtime/gpu/gpu_shape.hpp"
+#include "ngraph/runtime/gpu/nvdiff.hpp"
+#include "ngraph/runtime/gpu/nvshape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/op/convolution.hpp"
@@ -28,7 +29,7 @@
namespace ngraph
{
-class GPUShape;
+class NVShape;
namespace runtime
{
@@ -48,65 +49,65 @@ namespace ngraph
public:
size_t build_pad(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape pad_below,
-GPUShape pad_above,
-GPUShape pad_interior,
+NVShape input_shape,
+NVShape output_shape,
+NVShape pad_below,
+NVShape pad_above,
+NVShape pad_interior,
const std::string& pad_value = "");
size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape padding_below,
-GPUShape padding_interior);
+NVShape input_shape,
+NVShape output_shape,
+NVShape padding_below,
+NVShape padding_interior);
size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
+NVShape input_shape,
+NVShape output_shape,
size_t window_width,
size_t window_stride);
size_t build_avg_pool(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape window_shape,
-GPUShape window_stride,
-GPUShape padding_below,
+NVShape input_shape,
+NVShape output_shape,
+NVShape window_shape,
+NVShape window_stride,
+NVShape padding_below,
bool include_pad = false);
size_t build_slice(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape lower_bounds,
-GPUShape slice_strides,
-GPUShape output_shape);
+NVShape input_shape,
+NVShape lower_bounds,
+NVShape slice_strides,
+NVShape output_shape);
size_t build_reduce_window(const OpName op_name,
const std::vector<std::string>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
-GPUShape reduce_window_shape,
-GPUShape reduce_window_strides);
+NVShape input_shape,
+NVShape output_shape,
+NVShape reduce_window_shape,
+NVShape reduce_window_strides);
size_t build_reverse_sequence(const std::array<std::string, 3>& dtypes,
-GPUShape input_shape0,
-GPUShape input_shape1,
-GPUShape output_shape,
+NVShape input_shape0,
+NVShape input_shape1,
+NVShape output_shape,
size_t batch_axis,
size_t sequence_axis);
size_t build_onehot(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape output_shape,
+NVShape input_shape,
+NVShape output_shape,
size_t one_hot_axis);
size_t build_reverse(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
+NVShape input_shape,
std::vector<uint32_t> reverse_axes);
template <typename T>
size_t build_elementwise(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape)
+NVShape tensor_shape)
{
return build_elementwise_n_to_1(
dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
@@ -114,7 +115,7 @@ namespace ngraph
template <typename ELEMENTWISE_OP_TYPE, typename REDUCE_OP_TYPE = ngraph::op::Nop>
size_t build_elementwise_collective(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const std::set<size_t>& reduced_tensors = {},
const std::set<size_t>& axes = {},
bool save_elementwise = false)
@@ -130,37 +131,37 @@ namespace ngraph
}
size_t build_replace_slice(const std::array<std::string, 3>& dtypes,
-GPUShape tensor_shape,
-GPUShape source_shape,
-GPUShape lower_bounds,
-GPUShape upper_bounds,
-GPUShape slice_stride);
+NVShape tensor_shape,
+NVShape source_shape,
+NVShape lower_bounds,
+NVShape upper_bounds,
+NVShape slice_stride);
size_t build_broadcast(const std::array<std::string, 2>& dtypes,
-GPUShape result_shape,
+NVShape result_shape,
const std::set<size_t>& bcast_axes);
size_t build_reshape(const std::array<std::string, 2>& dtypes,
-GPUShape input_shape,
-GPUShape input_order);
+NVShape input_shape,
+NVShape input_order);
size_t build_convolution(const std::array<std::string, 3>& dtypes,
-GPUShape input_shape,
-GPUShape input_pad_below,
-GPUShape input_dilation,
-GPUShape filter_shape,
-GPUShape filter_stride,
-GPUShape filter_dilation,
-GPUShape output_shape);
+NVShape input_shape,
+NVShape filter_shape,
+NVShape output_shape,
+NVShape filter_stride,
+NVShape filter_dilation,
+NVShape input_dilation,
+NVDiff input_pad_below);
size_t build_concat(const std::vector<std::string>& dtypes,
-std::vector<GPUShape> input_shapes,
+std::vector<NVShape> input_shapes,
size_t concat_axis,
-GPUShape output_shape);
+NVShape output_shape);
size_t build_softmax_divide(const std::vector<std::string>& dtypes,
-GPUShape input_shape,
-GPUShape reduce_shape,
+NVShape input_shape,
+NVShape reduce_shape,
std::vector<size_t> axes_flag);
void debug_sync();
@@ -171,14 +172,14 @@ namespace ngraph
uint32_t align_to_block_size(uint32_t threads, uint32_t block_size);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
const std::string& tensor_name,
-GPUShape shape);
+NVShape shape);
std::string include_helpers();
size_t build_elementwise_n_to_1(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const char* op,
const char* kernel);
size_t build_fused_ew_to_collective(const std::vector<std::string>& dtypes,
-GPUShape tensor_shape,
+NVShape tensor_shape,
const std::set<size_t>& reduced_tensors,
const std::set<size_t>& axes,
const char* op,
...
@@ -475,7 +475,7 @@ namespace ngraph
auto axis = concat->get_concatenation_axis();
std::vector<std::string> dtypes;
-std::vector<GPUShape> input_shapes;
+std::vector<NVShape> input_shapes;
for (auto arg : args)
{
dtypes.push_back(arg.get_type());
...
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cstdio>
#include <stdexcept>
#include <vector>
#include "ngraph/coordinate_diff.hpp"
namespace ngraph
{
class Shape;
/// \brief Shape for a tensor resident on GPU.
class NVDiff : public std::vector<int32_t>
{
public:
NVDiff(const std::initializer_list<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
NVDiff(const std::vector<int32_t>& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
NVDiff(const NVDiff& axis_lengths)
: std::vector<int32_t>(axis_lengths)
{
}
explicit NVDiff(size_t n, int32_t initial_value = 0)
: std::vector<int32_t>(n, initial_value)
{
}
template <class InputIterator>
NVDiff(InputIterator first, InputIterator last)
: std::vector<int32_t>(first, last)
{
}
NVDiff() {}
NVDiff& operator=(const NVDiff& v)
{
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
NVDiff& operator=(NVDiff&& v)
{
static_cast<std::vector<int32_t>*>(this)->operator=(v);
return *this;
}
NVDiff(const CoordinateDiff& coord)
{
for (auto const& dim : coord)
{
if (std::abs(dim) >> 32 != 0)
{
throw std::runtime_error(
"Request for CoordinateDiff which exceed the bitwidth available for "
"NVDiffs (32)");
}
this->push_back(static_cast<int32_t>(dim));
}
}
};
}
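
As a usage note (not part of the diff): because the CoordinateDiff constructor above is not declared explicit, call sites can pass a CoordinateDiff such as node->get_padding_below() wherever an NVDiff parameter is expected, which is what the build_convolution signature change above relies on. A small hypothetical example of that conversion:

// Hypothetical example of the implicit CoordinateDiff -> NVDiff conversion
// defined above; the values are illustrative only.
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/runtime/gpu/nvdiff.hpp"

void pad_example()
{
    ngraph::CoordinateDiff padding_below{-1, 0, 2};
    ngraph::NVDiff pad = padding_below; // element-wise narrowing to int32_t with a 32-bit range check
    (void)pad; // values whose magnitude does not fit in 32 bits would throw std::runtime_error instead
}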
@@ -31,129 +31,114 @@ namespace ngraph
{
class Shape;
/// \brief Shape for a tensor resident on GPU.
-class GPUShape : public std::vector<int32_t>
+class NVShape : public std::vector<uint32_t>
{
public:
-GPUShape(const std::initializer_list<int32_t>& axis_lengths)
-: std::vector<int32_t>(axis_lengths)
+NVShape(const std::initializer_list<uint32_t>& axis_lengths)
+: std::vector<uint32_t>(axis_lengths)
{
}
-GPUShape(const std::vector<int32_t>& axis_lengths)
-: std::vector<int32_t>(axis_lengths)
+NVShape(const std::vector<uint32_t>& axis_lengths)
+: std::vector<uint32_t>(axis_lengths)
{
}
-GPUShape(const GPUShape& axis_lengths)
-: std::vector<int32_t>(axis_lengths)
+NVShape(const NVShape& axis_lengths)
+: std::vector<uint32_t>(axis_lengths)
{
}
-explicit GPUShape(size_t n, int32_t initial_value = 0)
-: std::vector<int32_t>(n, initial_value)
+explicit NVShape(size_t n, uint32_t initial_value = 0)
+: std::vector<uint32_t>(n, initial_value)
{
}
template <class InputIterator>
-GPUShape(InputIterator first, InputIterator last)
-: std::vector<int32_t>(first, last)
+NVShape(InputIterator first, InputIterator last)
+: std::vector<uint32_t>(first, last)
{
}
-GPUShape() {}
-GPUShape& operator=(const GPUShape& v)
+NVShape() {}
+NVShape& operator=(const NVShape& v)
{
-static_cast<std::vector<int32_t>*>(this)->operator=(v);
+static_cast<std::vector<uint32_t>*>(this)->operator=(v);
return *this;
}
-GPUShape& operator=(GPUShape&& v)
+NVShape& operator=(NVShape&& v)
{
-static_cast<std::vector<int32_t>*>(this)->operator=(v);
+static_cast<std::vector<uint32_t>*>(this)->operator=(v);
return *this;
}
-GPUShape(const std::vector<size_t>& vec)
+NVShape(const std::vector<size_t>& vec)
{
for (size_t const& size : vec)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request exceeds the bitwidth available for GPUShapes (32)");
+"Request exceeds the bitwidth available for NVShapes (32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
-GPUShape(const Shape& shape)
+NVShape(const Shape& shape)
{
for (size_t const& size : shape)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request for Shape which exceeds the bitwidth available for GPUShapes "
+"Request for Shape which exceeds the bitwidth available for NVShapes "
"(32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
-GPUShape(const Strides& strides)
+NVShape(const Strides& strides)
{
for (size_t const& size : strides)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request for Strides which exceed the bitwidth available for GPUShapes "
+"Request for Strides which exceed the bitwidth available for NVShapes "
"(32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
-GPUShape(const Coordinate& coord)
+NVShape(const Coordinate& coord)
{
for (size_t const& size : coord)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request for Coordinate which exceed the bitwidth available for GPUShapes "
+"Request for Coordinate which exceed the bitwidth available for NVShapes "
"(32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
-GPUShape(const CoordinateDiff& coord)
-{
-for (auto const& dim : coord)
-{
-if (dim > 0 && dim >> 32 != 0)
-{
-throw std::runtime_error(
-"Request for CoordinateDiff which exceed the bitwidth available for "
-"GPUShapes "
-"(32)");
-}
-this->push_back(static_cast<int32_t>(dim));
-}
-}
-GPUShape(const AxisVector& vec)
+NVShape(const AxisVector& vec)
{
for (auto const& size : vec)
{
if (size >> 32 != 0)
{
throw std::runtime_error(
-"Request for axis vector which exceed the bitwidth available for GPUShapes "
+"Request for axis vector which exceed the bitwidth available for NVShapes "
"(32)");
}
-this->push_back(static_cast<int32_t>(size));
+this->push_back(static_cast<uint32_t>(size));
}
}
};
...
@@ -20,15 +20,15 @@
#include "gtest/gtest.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
-#include "ngraph/runtime/gpu/gpu_shape.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
+#include "ngraph/runtime/gpu/nvshape.hpp"
using namespace ngraph;
TEST(gpu_test, gpu_shape_from_64bit_shape)
{
Shape shape{1UL << 33};
-ASSERT_ANY_THROW([](GPUShape s) {}(shape););
+ASSERT_ANY_THROW([](NVShape s) {}(shape););
}
TEST(gpu_test, memory_manager_unallocated)
...