Commit 905cafd2 authored by Chris Sullivan's avatar Chris Sullivan Committed by Robert Kimball

GPU op::Result implementation (#611)

* Added GPU emitter for op::Result.
For now it simply copies the output tensor.

All but 3 tests now pass. The remaining
failing tests are:
* GPU.dot_0_0
* GPU.dot_matrix_2x0_0x2
* GPU.dot_2x0_0

* Removed call to handle memory aliasing in gpu_external_function.

* fix gpu emitter bug that will return in the middle of function

* Merge pull request #609 from NervanaSystems/tfl/fix_return_bug

fix gpu emitter bug that will return in the middle of function

* GPU backend skips added for recent softmax test and updated aliased output test that uses op::Constant.
parent 9db548c6
......@@ -63,6 +63,10 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
......@@ -78,10 +82,13 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
......@@ -126,6 +133,11 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
const ngraph::op::Dot* dot = static_cast<const ngraph::op::Dot*>(n);
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
......@@ -136,7 +148,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << second.get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "cublasScopy("
<< "cublas_handle,"
<< "count ," << second.get_name() << ","
......@@ -149,17 +160,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
return;
}
//return if output size is 0;
if (out[0].get_size() == 0)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "return;\n";
writer.indent--;
writer << "}\n";
return;
}
//set output to 0 if input size is 0
if (args[0].get_size() == 0 || args[1].get_size() == 0)
{
......@@ -167,7 +167,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
writer.indent++;
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, " << out[0].get_size()
<< " * sizeof(float));\n";
writer << "return;\n";
writer.indent--;
writer << "}\n";
return;
......@@ -307,10 +306,13 @@ void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
......@@ -348,10 +350,13 @@ void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
......@@ -390,10 +395,13 @@ void runtime::gpu::GPU_Emitter::EmitNegative(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer += R"(
float alpha1 = -1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor;
......@@ -457,6 +465,10 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
auto broadcast = static_cast<const ngraph::op::Broadcast*>(n);
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
......@@ -539,6 +551,10 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
auto reshape = static_cast<const op::Reshape*>(n);
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
......@@ -646,10 +662,13 @@ void runtime::gpu::GPU_Emitter::EmitMultiply(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
......@@ -808,10 +827,13 @@ void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor;
......@@ -894,3 +916,17 @@ void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitResult(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << "{ //" << n->get_name() << "\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
<< ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
return;
}
......@@ -92,6 +92,7 @@ namespace ngraph
static void EMITTER_DECL(EmitReverse);
static void EMITTER_DECL(EmitReduceWindow);
static void EMITTER_DECL(EmitSelectAndScatter);
static void EMITTER_DECL(EmitResult);
};
}
}
......
......@@ -197,6 +197,7 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
{TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
{TI(ngraph::op::SelectAndScatter), &runtime::gpu::GPU_Emitter::EmitSelectAndScatter},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::EmitResult},
};
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
......@@ -657,7 +658,6 @@ using namespace std;
// Emit operation epilogue
if (!node->is_parameter() && !node->is_constant())
{
handle_output_alias(writer, *node, output_alias_map);
if (m_emit_timing)
{
emit_debug_function_exit(writer, node.get(), in, out);
......
......@@ -1305,6 +1305,7 @@ TEST(${BACKEND_NAME}, backwards_slice)
TEST(${BACKEND_NAME}, backwards_softmax_all)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1322,6 +1323,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_all)
TEST(${BACKEND_NAME}, backwards_softmax_axis)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1339,6 +1341,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_axis)
TEST(${BACKEND_NAME}, backwards_softmax_underflow)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1358,6 +1361,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_underflow)
TEST(${BACKEND_NAME}, backwards_softmax_3d)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......
......@@ -120,6 +120,7 @@ TEST(${BACKEND_NAME}, component_cleanup)
TEST(${BACKEND_NAME}, aliased_output)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -8441,6 +8442,7 @@ TEST(${BACKEND_NAME}, relu_4Dbackprop)
TEST(${BACKEND_NAME}, softmax_all)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f =
......@@ -8473,6 +8475,7 @@ TEST(${BACKEND_NAME}, softmax_all)
TEST(${BACKEND_NAME}, softmax_axis)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{1}), op::ParameterVector{A});
......@@ -8501,6 +8504,7 @@ TEST(${BACKEND_NAME}, softmax_axis)
TEST(${BACKEND_NAME}, softmax_underflow)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A});
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment