Commit cccdc304 authored by Ayan Moitra's avatar Ayan Moitra Committed by Scott Cyphers

Add cudnn sum and test that exposes tf test failure for nvGPU (#2033)

* Add cudnn sum and test that exposes failure

* Add test to interpreter manifest and move test to sum file

* change test name

* Fails on INTELGPU; no need to run on CPU

* Comment incorporation + Kahan summation for interpreter

* small oversight correction

* resolve conflict

* minor variable name edit
parent 76b8b4d4
@@ -1563,6 +1563,15 @@ void runtime::gpu::GPU_Emitter::emit_Subtract(EMIT_ARGS)
}
void runtime::gpu::GPU_Emitter::emit_Sum(EMIT_ARGS)
{
runtime::gpu::GPU_Emitter::emit_Sum_1(external_function, writer, node, args, out);
}
void runtime::gpu::GPU_Emitter::emit_Sum_0(EMIT_ARGS)
/* emit_Sum_0 uses native CUDA kernels to perform the Sum reduction. This method
is faster than the cuDNN implementation, but in its current state it is less
precise than cuDNN reduce, which causes the TensorFlow tests aimed at testing
stability to fail. */
{
const ngraph::op::Sum* sum = static_cast<const ngraph::op::Sum*>(node);
writer.block_begin();
@@ -1602,6 +1611,49 @@ void runtime::gpu::GPU_Emitter::emit_Sum(EMIT_ARGS)
writer.block_end();
}
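As context for the precision note above: straight sequential fp32 accumulation of many same-magnitude values, as the native kernels effectively do, drifts measurably from a double-precision reference. A minimal standalone C++ sketch, not part of this commit, using magnitudes that mirror the sum_stable_acc test below:

    #include <iostream>
    #include <vector>

    int main()
    {
        // 3000 values near 1000.0f, the same magnitudes the test below uses.
        std::vector<float> data(3000, 1000.1f);

        float naive = 0.0f;     // sequential fp32 accumulation
        double reference = 0.0; // double-precision reference
        for (float v : data)
        {
            naive += v;
            reference += v;
        }

        // Once the running total reaches ~3e6, each 1000.1f addend loses the
        // bits below 3e6 * 2^-24 ~= 0.18, and the error compounds per add.
        std::cout << "fp32: " << naive << "  f64: " << reference
                  << "  error: " << reference - naive << "\n";
    }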
void runtime::gpu::GPU_Emitter::emit_Sum_1(EMIT_ARGS)
/* emit_Sum_1 uses cuDNN to perform the Sum reduction. This method, although
slower than the native CUDA implementation, is more precise and fixes the
TensorFlow test failures. */
{
const ngraph::op::Sum* sum = static_cast<const ngraph::op::Sum*>(node);
std::vector<element::Type> dtypes{args[0].get_element_type(), out[0].get_element_type()};
cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
writer.block_begin();
{
if (out[0].get_size() != 0)
{
// one of the input axes has zero size; fill the output with zeros
if (args[0].get_size() == 0)
{
kernel::emit_memset(writer, out[0], 0);
}
else if (args[0].get_size() == out[0].get_size())
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
else
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto sum_index =
cudnn_emitter->build_reduce_forward(reduce_op,
dtypes,
args[0].get_shape(),
sum->get_reduction_axes(),
CUDNNEmitter::ReductionMode::Reduce);
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
}
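For context, the cuDNN call that build_reduce_forward ultimately emits an invocation of looks roughly like the standalone sketch below. This is an assumption-laden illustration (a plain cudnnReduceTensor over a 4-D NCHW tensor shaped like the sum_stable_acc test, error checking omitted), not the CUDNNEmitter code itself:

    #include <cuda_runtime.h>
    #include <cudnn.h>
    #include <vector>

    // Sum-reduce a (10,10,10,30) NCHW tensor over axes {1,2,3} -> (10,1,1,1).
    int main()
    {
        cudnnHandle_t handle;
        cudnnCreate(&handle);

        cudnnTensorDescriptor_t in_desc, out_desc;
        cudnnCreateTensorDescriptor(&in_desc);
        cudnnCreateTensorDescriptor(&out_desc);
        cudnnSetTensor4dDescriptor(in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 10, 10, 10, 30);
        // Reduced axes are expressed as size-1 dimensions in the output.
        cudnnSetTensor4dDescriptor(out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 10, 1, 1, 1);

        cudnnReduceTensorDescriptor_t reduce_desc;
        cudnnCreateReduceTensorDescriptor(&reduce_desc);
        cudnnSetReduceTensorDescriptor(reduce_desc,
                                       CUDNN_REDUCE_TENSOR_ADD,
                                       CUDNN_DATA_FLOAT,
                                       CUDNN_PROPAGATE_NAN,
                                       CUDNN_REDUCE_TENSOR_NO_INDICES,
                                       CUDNN_32BIT_INDICES);

        size_t workspace_bytes = 0;
        cudnnGetReductionWorkspaceSize(handle, reduce_desc, in_desc, out_desc, &workspace_bytes);

        float *d_in, *d_out;
        void* d_workspace;
        cudaMalloc(&d_in, 10 * 10 * 10 * 30 * sizeof(float));
        cudaMalloc(&d_out, 10 * sizeof(float));
        cudaMalloc(&d_workspace, workspace_bytes);

        std::vector<float> host_in(10 * 10 * 10 * 30, 1000.1f);
        cudaMemcpy(d_in, host_in.data(), host_in.size() * sizeof(float), cudaMemcpyHostToDevice);

        const float alpha = 1.0f, beta = 0.0f;
        cudnnReduceTensor(handle, reduce_desc,
                          nullptr, 0, // no indices needed for ADD
                          d_workspace, workspace_bytes,
                          &alpha, in_desc, d_in,
                          &beta, out_desc, d_out);

        cudaFree(d_workspace);
        cudaFree(d_out);
        cudaFree(d_in);
        cudnnDestroyReduceTensorDescriptor(reduce_desc);
        cudnnDestroyTensorDescriptor(out_desc);
        cudnnDestroyTensorDescriptor(in_desc);
        cudnnDestroy(handle);
    }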
void runtime::gpu::GPU_Emitter::emit_Tan(EMIT_ARGS)
{
emit_elementwise<ngraph::op::Tan>(external_function, writer, node, args, out);
@@ -76,6 +76,8 @@ namespace ngraph
}
static void emit_ArgReduce(EMIT_ARGS, cudnnReduceTensorOp_t);
static void emit_Sum_0(EMIT_ARGS);
static void emit_Sum_1(EMIT_ARGS);
/// \brief Create a list of node names for each arg in args
/// \param args list of tensor arguments
@@ -119,3 +119,5 @@ shape_of_scalar
shape_of_vector
shape_of_matrix
shape_of_5d
sum_stable_acc
@@ -43,12 +43,14 @@ namespace ngraph
CoordinateTransform input_transform(in_shape);
// Kahan (compensated) summation: keep one compensation term per output
// coordinate, so rounding error does not leak between different outputs
// of the reduction.
std::vector<T> c(shape_size(out_shape), 0);
for (const Coordinate& input_coord : input_transform)
{
    Coordinate output_coord = reduce(input_coord, reduction_axes);
    size_t out_index = output_transform.index(output_coord);
    T y = arg[input_transform.index(input_coord)] - c[out_index];
    T t = out[out_index] + y;
    c[out_index] = (t - out[out_index]) - y;
    out[out_index] = t;
}
}
}
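To see why the compensated loop above helps, here is a minimal standalone C++ sketch of Kahan summation, not part of this commit; compare its error against the naive fp32 drift shown earlier:

    #include <iostream>
    #include <vector>

    int main()
    {
        std::vector<float> data(30000, 1000.1f);

        float sum = 0.0f, c = 0.0f; // c carries the low-order bits lost by sum
        for (float v : data)
        {
            float y = v - c;   // re-inject the previously lost bits
            float t = sum + y; // big + small: low bits of y are dropped here
            c = (t - sum) - y; // recover exactly what was dropped
            sum = t;
        }

        double reference = 0.0; // double-precision reference
        for (float v : data)
        {
            reference += v;
        }

        std::cout << "kahan error vs f64: " << reference - sum << "\n";
    }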
@@ -427,3 +427,38 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_5d_to_scalar)
backend->call_with_validate(f, {result}, {a});
EXPECT_EQ(std::vector<float>{243.}, read_vector<float>(result));
}
#if NGRAPH_INTERPRETER_ENABLE
NGRAPH_TEST(${BACKEND_NAME}, sum_stable_acc)
{
std::string backend_name = "${BACKEND_NAME}";
if (backend_name == "INTERPRETER")
{
    // The test compares ${BACKEND_NAME} against the INTERPRETER reference,
    // so running it on INTERPRETER itself proves nothing; skip this test
    // rather than terminating the whole test binary.
    return;
}
Shape shape_a{10, 10, 10, 30};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{10};
auto f =
make_shared<Function>(make_shared<op::Sum>(A, AxisSet{1, 2, 3}), op::ParameterVector{A});
test::Uniform<float> rng(1000.0f, 1000.1f, 2112);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto ref_func = clone_function(*f);
auto bk_func = clone_function(*f);
auto ref_results = execute(ref_func, args, "INTERPRETER");
auto bk_results = execute(bk_func, args, "${BACKEND_NAME}");
EXPECT_TRUE(test::all_close_f(ref_results.at(0), bk_results.at(0), 24, 3));
}
#endif
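The all_close_f check above compares the two results at a given mantissa-bit granularity rather than with an absolute epsilon. As a rough illustration of that style of comparison, here is a hypothetical helper, not nGraph's actual test::all_close_f, that measures how many representable floats (ULPs) apart two values are:

    #include <cmath>
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>
    #include <iostream>
    #include <limits>

    // Remap sign-magnitude float bits onto a monotonically increasing
    // integer scale, so adjacent floats map to adjacent integers.
    static int32_t to_ordered(float f)
    {
        int32_t i;
        std::memcpy(&i, &f, sizeof i);
        return i >= 0 ? i : std::numeric_limits<int32_t>::min() - i;
    }

    // Distance between two floats in units in the last place (ULPs).
    static int64_t ulp_distance(float a, float b)
    {
        return std::llabs(static_cast<int64_t>(to_ordered(a)) - to_ordered(b));
    }

    int main()
    {
        float x = 3000300.0f;               // magnitude of the sums above
        float y = std::nextafterf(x, 4e6f); // one representable float higher
        std::cout << "distance: " << ulp_distance(x, y) << " ULP\n"; // 1
    }

Roughly speaking, the mantissa-bit and tolerance-bit parameters passed to all_close_f express the same idea at a coarser granularity: how many low-order mantissa bits the two results are allowed to disagree on.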