Commit 7188b5bd authored by Ayan Moitra's avatar Ayan Moitra Committed by Scott Cyphers

Support arg_reduce for int32 for nvGPU (#2122)

* Add argmReduce support for int32 +  tests

* add new line

* add to intelGPU manifest

* Address Fenglei's comment

* address comments

* Small change to emitter logic.

* Chris's comments incorporated

* minor edits + clang

* edit
parent d9c540bb
...@@ -16,3 +16,6 @@ quantize_clamp_int32 ...@@ -16,3 +16,6 @@ quantize_clamp_int32
# failing in CI build but passing on local machine # failing in CI build but passing on local machine
max_3d_to_scalar_int32 max_3d_to_scalar_int32
argmin_trivial_in_i32
argmax_4D_axis_3_i64_in_i32
...@@ -165,6 +165,15 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -165,6 +165,15 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
const ReductionMode& reduction_mode) const ReductionMode& reduction_mode)
{ {
auto input_type = dtypes[0]; auto input_type = dtypes[0];
bool use_cudnn_reduce = !((reduction_mode == ReductionMode::Reduce) &&
((input_type == element::i32) || (input_type == element::i8)));
NGRAPH_ASSERT(use_cudnn_reduce)
<< "cuDNN reduce for input type int32_t or int8_t currently not supported";
bool unsupported_int8_type_arg_reduce =
!((reduction_mode == ReductionMode::ArgReduce) && (input_type == element::i8));
NGRAPH_ASSERT(unsupported_int8_type_arg_reduce)
<< "cuDNN arg_reduce for input type int8_t currently not supported";
auto output_type = dtypes[1]; auto output_type = dtypes[1];
std::stringstream ss; std::stringstream ss;
ss << "reduce_" << reduce_op << "_" << input_type.c_type_string() << "_" ss << "reduce_" << reduce_op << "_" << input_type.c_type_string() << "_"
...@@ -180,7 +189,8 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -180,7 +189,8 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
} }
auto& desc = m_descriptors.build<cudnnReduceTensorDescriptor_t>(); auto& desc = m_descriptors.build<cudnnReduceTensorDescriptor_t>();
cudnnDataType_t data_type = get_cudnn_datatype(input_type); auto modified_input_type = (input_type == element::i32) ? element::f64 : input_type;
cudnnDataType_t data_type = get_cudnn_datatype(modified_input_type);
cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW; cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW;
auto& input_desc = tensor_descriptor_from_shape(input_shape, data_type, tensor_format); auto& input_desc = tensor_descriptor_from_shape(input_shape, data_type, tensor_format);
Shape output_shape = input_shape; Shape output_shape = input_shape;
...@@ -193,15 +203,6 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -193,15 +203,6 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
// get an allocator for transient per kernel gpu memory // get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator(); GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t input_buffer_size = shape_size(input_shape) * input_type.size();
if (workspace_size < input_buffer_size)
{
workspace_size = input_buffer_size;
}
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0); void* alpha = m_host_parameters.allocate_by_datatype(data_type, 1.0);
void* beta = m_host_parameters.allocate_by_datatype(data_type, 0); void* beta = m_host_parameters.allocate_by_datatype(data_type, 0);
...@@ -217,6 +218,12 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -217,6 +218,12 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
CUDNN_NOT_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES)); CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
// emit reduce operation // emit reduce operation
reduce.reset(new gpu::primitive{ reduce.reset(new gpu::primitive{
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) { [=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) {
...@@ -243,71 +250,83 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO ...@@ -243,71 +250,83 @@ size_t runtime::gpu::CUDNNEmitter::build_reduce_forward(const cudnnReduceTensorO
{ {
if (output_type == element::i32 || output_type == element::i64) if (output_type == element::i32 || output_type == element::i64)
{ {
size_t indices_size = shape_size(output_shape) * output_type.size(); // Since cuDNN only outputs int32 indices
size_t indices_size = shape_size(output_shape) * element::i32.size();
size_t reduce_buffer_idx = size_t reduce_buffer_idx =
allocator.reserve_workspace(shape_size(output_shape) * input_type.size()); allocator.reserve_workspace(shape_size(output_shape) * modified_input_type.size());
CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(desc, CUDNN_SAFE_CALL(cudnnSetReduceTensorDescriptor(desc,
reduce_op, reduce_op,
data_type, data_type,
CUDNN_NOT_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_FLATTENED_INDICES, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
CUDNN_32BIT_INDICES)); CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_SAFE_CALL(cudnnGetReductionWorkspaceSize(
*m_ctx->cudnn_handle, desc, input_desc, output_desc, &workspace_size));
size_t workspace_idx = allocator.reserve_workspace(workspace_size);
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
std::function<void(void**, void**)> convert_output = [](void** inputs, void** outputs) {
};
std::function<void*(void*)> convert_output_space = [](void* ptr) { return ptr; };
if (output_type == element::i64) if (output_type == element::i64)
{ {
size_t workspace_indices_idx = size_t workspace_indices_idx = allocator.reserve_workspace(indices_size);
allocator.reserve_workspace(shape_size(output_shape) * input_type.size()); auto convert_idx = cuda_emitter->template build_elementwise<op::Convert>(
auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
auto convert_idx = cuda_emitter->build_elementwise<op::Convert>(
{element::i32.c_type_string(), element::i64.c_type_string()}, output_shape); {element::i32.c_type_string(), element::i64.c_type_string()}, output_shape);
reduce.reset(new gpu::primitive{ convert_output = [=](void** inputs, void** outputs) {
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) { gpu::invoke_primitive(m_ctx, convert_idx, inputs, outputs);
void* workspace_indices_ptr = };
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_indices_idx); convert_output_space = [=](void* ptr) {
void* workspace_ptr = return runtime::gpu::invoke_memory_primitive(m_ctx, workspace_indices_idx);
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx); };
void* reduce_buffer =
runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx);
CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
desc,
workspace_indices_ptr,
indices_size,
workspace_ptr,
workspace_size,
alpha,
input_desc,
inputs[0],
beta,
output_desc,
reduce_buffer));
gpu::invoke_primitive(m_ctx, convert_idx, &workspace_indices_ptr, outputs);
debug_sync();
}});
} }
else
std::function<void(void**, void**)> convert_input = [](void** inputs, void** outputs) {
};
std::function<void*(void*)> convert_input_space = [](void* ptr) { return ptr; };
if (input_type == element::i32)
{ {
reduce.reset(new gpu::primitive{ size_t input_idx = allocator.reserve_workspace(shape_size(input_shape) *
[=, &desc, &input_desc, &output_desc](void** inputs, void** outputs) { modified_input_type.size());
auto convert_input_idx = cuda_emitter->template build_elementwise<op::Convert>(
{input_type.c_type_string(), modified_input_type.c_type_string()}, input_shape);
convert_input = [=](void** inputs, void** outputs) {
gpu::invoke_primitive(m_ctx, convert_input_idx, inputs, outputs);
};
convert_input_space = [=](void* ptr) {
return runtime::gpu::invoke_memory_primitive(m_ctx, input_idx);
};
}
void* workspace_ptr = reduce.reset(new gpu::primitive{[=, &desc, &input_desc, &output_desc](void** inputs,
runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx); void** outputs) {
void* input_ptr = convert_input_space(inputs[0]);
void* workspace_indices_ptr = convert_output_space(outputs[0]);
void* workspace_ptr = runtime::gpu::invoke_memory_primitive(m_ctx, workspace_idx);
void* reduce_buffer = void* reduce_buffer =
runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx); runtime::gpu::invoke_memory_primitive(m_ctx, reduce_buffer_idx);
convert_input(inputs, &input_ptr);
CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle, CUDNN_SAFE_CALL(cudnnReduceTensor(*m_ctx->cudnn_handle,
desc, desc,
outputs[0], workspace_indices_ptr,
indices_size, indices_size,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
alpha, alpha,
input_desc, input_desc,
inputs[0], input_ptr,
beta, beta,
output_desc, output_desc,
reduce_buffer)); reduce_buffer));
convert_output(&workspace_indices_ptr, outputs);
debug_sync(); debug_sync();
}}); }});
} }
}
else else
{ {
std::stringstream ss_er; std::stringstream ss_er;
......
...@@ -134,6 +134,7 @@ shape_of_vector ...@@ -134,6 +134,7 @@ shape_of_vector
shape_of_matrix shape_of_matrix
shape_of_5d shape_of_5d
sum_stable_acc sum_stable_acc
sum_trivial_in_double
product_2d_to_scalar_int32 product_2d_to_scalar_int32
product_to_scalar_int32 product_to_scalar_int32
product_to_scalar_int8 product_to_scalar_int8
...@@ -141,3 +142,6 @@ max_matrix_rows_zero_int32 ...@@ -141,3 +142,6 @@ max_matrix_rows_zero_int32
max_to_scalar_int8 max_to_scalar_int8
min_to_scalar_int8 min_to_scalar_int8
max_3d_to_scalar_double max_3d_to_scalar_double
argmin_trivial_in_i32
argmax_4D_axis_3_i64_in_i32
argmin_trivial_in_double
...@@ -311,3 +311,82 @@ NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3) ...@@ -311,3 +311,82 @@ NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3)
.get_vector()), .get_vector()),
read_vector<int>(result)); read_vector<int>(result));
} }
NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial_in_i32)
{
Shape shape{4, 3};
Shape rshape{3};
auto A = make_shared<op::Parameter>(element::i32, shape);
auto f = make_shared<Function>(make_shared<op::ArgMin>(A, 0, element::i32), ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::i32, shape);
copy_data(a, vector<int32_t>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
auto result = backend->create_tensor(element::i32, rshape);
backend->call_with_validate(f, {result}, {a});
EXPECT_EQ((vector<int>{3, 2, 1}), read_vector<int>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3_i64_in_i32)
{
Shape shape{2, 2, 5, 5}; // NCHW ->(0,1,2,3)
Shape rshape{2, 2, 5};
auto A = make_shared<op::Parameter>(element::i32, shape);
auto f = make_shared<Function>(make_shared<op::ArgMax>(A, 3, element::i64), ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::i32, shape);
copy_data(a,
test::NDArray<int32_t, 4>({{{{0, 1, 0, 2, 1}, // img 0 ch 0
{0, 3, 2, 0, 0},
{2, 0, 0, 0, 1},
{2, 0, 1, 1, 2},
{0, 2, 1, 0, 0}},
{{0, 0, 0, 2, 0}, // img 0 ch 1
{0, 2, 3, 0, 1},
{2, 0, 1, 0, 2},
{3, 1, 0, 0, 0},
{2, 0, 0, 0, 0}}},
{{{0, 2, 1, 1, 0}, // img 1 ch 0
{0, 0, 2, 0, 1},
{0, 0, 1, 2, 3},
{2, 0, 0, 3, 0},
{0, 0, 0, 0, 0}},
{{2, 1, 0, 0, 1}, // img 1 ch 1
{0, 2, 0, 0, 0},
{1, 1, 2, 0, 2},
{1, 1, 1, 0, 1},
{1, 0, 0, 0, 2}}}})
.get_vector());
auto result = backend->create_tensor(element::i64, rshape);
backend->call_with_validate(f, {result}, {a});
EXPECT_EQ((test::NDArray<int64_t, 3>({{{3, 1, 0, 0, 1}, {3, 2, 0, 0, 0}}, //ch0
{{1, 2, 4, 3, 0}, {0, 1, 2, 0, 4}}}) //ch1
.get_vector()),
read_vector<int64_t>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial_in_double)
{
Shape shape{4, 3};
Shape rshape{3};
auto A = make_shared<op::Parameter>(element::f64, shape);
auto f = make_shared<Function>(make_shared<op::ArgMin>(A, 0, element::i32), ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f64, shape);
copy_data(a, vector<double>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
auto result = backend->create_tensor(element::i32, rshape);
backend->call_with_validate(f, {result}, {a});
EXPECT_EQ((vector<int32_t>{3, 2, 1}), read_vector<int32_t>(result));
}
...@@ -485,6 +485,24 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_2d_to_scalar_int8) ...@@ -485,6 +485,24 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_2d_to_scalar_int8)
EXPECT_EQ(std::vector<int8_t>{45}, read_vector<int8_t>(result)); EXPECT_EQ(std::vector<int8_t>{45}, read_vector<int8_t>(result));
} }
NGRAPH_TEST(${BACKEND_NAME}, sum_trivial_in_double)
{
Shape shape{4, 3};
Shape rshape{3};
auto A = make_shared<op::Parameter>(element::f64, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0}), ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f64, shape);
copy_data(a, vector<double>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
auto result = backend->create_tensor(element::f64, rshape);
backend->call_with_validate(f, {result}, {a});
EXPECT_EQ((vector<double>{30, 22, 26}), read_vector<double>(result));
}
#if NGRAPH_INTERPRETER_ENABLE #if NGRAPH_INTERPRETER_ENABLE
NGRAPH_TEST(${BACKEND_NAME}, sum_stable_acc) NGRAPH_TEST(${BACKEND_NAME}, sum_stable_acc)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment