Commit 905cafd2 authored by Chris Sullivan, committed by Robert Kimball

GPU op::Result implementation (#611)

* Added GPU emitter for op::Result.
  For now it simply copies the input tensor to the output tensor.

All but 3 tests now pass. The remaining
failing tests are:
* GPU.dot_0_0
* GPU.dot_matrix_2x0_0x2
* GPU.dot_2x0_0

* Removed the call to handle memory aliasing (handle_output_alias) in gpu_external_function.

* Fixed a GPU emitter bug that emitted a return in the middle of the generated function.

* Merge pull request #609 from NervanaSystems/tfl/fix_return_bug

Fixed a GPU emitter bug that emitted a return in the middle of the generated function.

* Added GPU backend skips for the recent softmax tests and for the updated aliased-output test that uses op::Constant.
parent 9db548c6
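
The hunks below all follow one pattern: the zero-size check moves out of the generated code and into the emitter itself. Previously each emitter wrote "if(count == 0) return;" into the output; since the compiled function contains every op's block back to back, that return exited the whole function and skipped every op emitted after it (the bug fixed by #609). A minimal, self-contained sketch of the failure mode (hypothetical op names, not ngraph code):

#include <iostream>

// BEFORE: every op's block is emitted into one function; the guard emitted
// for Abs_0 returns from the whole function, so Result_1 never runs.
void compiled_function_before(int count)
{
    { // Abs_0
        if (count == 0) return; // exits the entire compiled function
        std::cout << "Abs_0 ran\n";
    }
    { // Result_1 -- skipped whenever count == 0
        std::cout << "Result_1 ran\n";
    }
}

// AFTER: the emitter checks out[0].get_size() at emission time and simply
// emits nothing for a zero-sized op, so later ops still execute.
void compiled_function_after(int /*count*/)
{
    // (no code emitted for the zero-sized Abs_0)
    { // Result_1
        std::cout << "Result_1 ran\n";
    }
}

int main()
{
    compiled_function_before(0); // prints nothing: Result_1 was skipped
    compiled_function_after(0);  // prints "Result_1 ran"
}

Checking out[0].get_size() == 0 at emission time means a zero-sized op contributes no code at all, and downstream ops still run.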
@@ -63,6 +63,10 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
@@ -78,10 +82,13 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
-    writer << "if(count == 0) return;\n";
     writer += R"(
 float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
 cudnnTensorDescriptor_t descriptor;
@@ -126,6 +133,11 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                         const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     const ngraph::op::Dot* dot = static_cast<const ngraph::op::Dot*>(n);
     const Shape& arg0_shape = args[0].get_shape();
     const Shape& arg1_shape = args[1].get_shape();
@@ -136,7 +148,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
         writer << "{ // " << n->get_name() << "\n";
         writer.indent++;
         writer << "int count = " << second.get_size() << ";\n";
-        writer << "if(count == 0) return;\n";
         writer << "cublasScopy("
                << "cublas_handle,"
                << "count ," << second.get_name() << ","
@@ -149,17 +160,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
         return;
     }
-    //return if output size is 0;
-    if (out[0].get_size() == 0)
-    {
-        writer << "{ // " << n->get_name() << "\n";
-        writer.indent++;
-        writer << "return;\n";
-        writer.indent--;
-        writer << "}\n";
-        return;
-    }
     //set output to 0 if input size is 0
     if (args[0].get_size() == 0 || args[1].get_size() == 0)
     {
@@ -167,7 +167,6 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
         writer.indent++;
         writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, " << out[0].get_size()
                << " * sizeof(float));\n";
-        writer << "return;\n";
         writer.indent--;
         writer << "}\n";
         return;
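
The cuda_memset branch above is the mathematically correct handling of the three previously failing dot tests listed in the commit message (GPU.dot_0_0, GPU.dot_matrix_2x0_0x2, GPU.dot_2x0_0): when the contracted axis has extent 0, every output element is a sum over zero terms, i.e. zero. A small host-side worked example (plain C++, no CUDA) showing the expected result for shapes {2,0} x {0,2}:

#include <cstdio>
#include <vector>

int main()
{
    const int M = 2, K = 0, N = 2; // dot({2,0}, {0,2}) -> {2,2}
    std::vector<float> a(M * K), b(K * N), c(M * N, -1.0f);
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
        {
            float sum = 0.0f; // empty sum: the K loop never executes
            for (int k = 0; k < K; ++k)
                sum += a[i * K + k] * b[k * N + j];
            c[i * N + j] = sum;
        }
    // prints "0 0 0 0" -- exactly what cuda_memset produces on the device
    std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);
}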
@@ -307,10 +306,13 @@ void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
-    writer << "if(count == 0) return;\n";
     writer += R"(
 float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
 cudnnTensorDescriptor_t descriptor;
@@ -348,10 +350,13 @@ void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
-    writer << "if(count == 0) return;\n";
     writer += R"(
 float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
 cudnnTensorDescriptor_t descriptor;
@@ -390,10 +395,13 @@ void runtime::gpu::GPU_Emitter::EmitNegative(
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
-    writer << "if(count == 0) return;\n";
     writer += R"(
 float alpha1 = -1.0, alpha2 = 0, beta = 0;
 cudnnTensorDescriptor_t descriptor;
@@ -457,6 +465,10 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     auto broadcast = static_cast<const ngraph::op::Broadcast*>(n);
     auto arg_shape = args[0].get_shape();
     auto result_shape = out[0].get_shape();
@@ -539,6 +551,10 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     auto reshape = static_cast<const op::Reshape*>(n);
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
@@ -646,10 +662,13 @@ void runtime::gpu::GPU_Emitter::EmitMultiply(
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                             const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
-    writer << "if(count == 0) return;\n";
     writer += R"(
 float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
 cudnnTensorDescriptor_t descriptor;
@@ -808,10 +827,13 @@ void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
                                          const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
     writer << "{ // " << n->get_name() << "\n";
     writer.indent++;
     writer << "int count = " << out[0].get_size() << ";\n";
-    writer << "if(count == 0) return;\n";
     writer += R"(
 float alpha1 = 1.0, alpha2 = 0, beta = 0;
 cudnnTensorDescriptor_t descriptor;
@@ -894,3 +916,17 @@ void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
 {
     throw std::runtime_error(n->get_name() + " is not implemented.");
 }
+
+void runtime::gpu::GPU_Emitter::EmitResult(codegen::CodeWriter& writer,
+                                           const ngraph::Node* n,
+                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
+                                           const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
+{
+    writer << "{ //" << n->get_name() << "\n";
+    writer.indent++;
+    writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
+           << ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
+    writer.indent--;
+    writer << "}\n";
+    return;
+}
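
To make the emission concrete: for a hypothetical Result node named Result_3 whose output is a float32 tensor with 4 elements (get_size() == 4, element size 4 bytes), the writer calls above would produce text along these lines (tensor names are illustrative, not taken from the diff):

{ //Result_3
    runtime::gpu::cuda_memcpyDtD(out0, arg0, 4 * 4);
}

That is, op::Result lowers to a single device-to-device copy of get_size() * element_size bytes from its input tensor to the function output tensor.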
@@ -92,6 +92,7 @@ namespace ngraph
             static void EMITTER_DECL(EmitReverse);
             static void EMITTER_DECL(EmitReduceWindow);
             static void EMITTER_DECL(EmitSelectAndScatter);
+            static void EMITTER_DECL(EmitResult);
         };
     }
 }
@@ -197,6 +197,7 @@ static const runtime::gpu::OpMap dispatcher{
     {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
     {TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
     {TI(ngraph::op::SelectAndScatter), &runtime::gpu::GPU_Emitter::EmitSelectAndScatter},
+    {TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::EmitResult},
 };

 runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
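
The new map entry is all that is needed to route op::Result nodes to the new emitter: the external function looks up each node's dynamic type in this table. Assuming TI(x) expands to std::type_index(typeid(x)) (the macro's definition lies outside this diff), a stripped-down, self-contained model of that dispatch:

#include <functional>
#include <iostream>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>

struct Node { virtual ~Node() = default; };
struct Result : Node {};

using Emitter = std::function<void(const Node*)>;
using OpMap = std::unordered_map<std::type_index, Emitter>;

#define TI(T) std::type_index(typeid(T))

int main()
{
    const OpMap dispatcher{
        {TI(Result), [](const Node*) { std::cout << "EmitResult\n"; }},
    };
    Result r;
    const Node* n = &r;
    // typeid(*n) yields the dynamic type, so the lookup finds Result's emitter
    auto it = dispatcher.find(std::type_index(typeid(*n)));
    if (it != dispatcher.end())
        it->second(n); // prints "EmitResult"
}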
@@ -657,7 +658,6 @@ using namespace std;
         // Emit operation epilogue
         if (!node->is_parameter() && !node->is_constant())
         {
-            handle_output_alias(writer, *node, output_alias_map);
             if (m_emit_timing)
             {
                 emit_debug_function_exit(writer, node.get(), in, out);
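
Dropping handle_output_alias here is coupled to the new op::Result handling: once every function output is materialized by its own Result node, each aliased output gets its own device-to-device copy, so no separate alias pass is needed in the epilogue. As a sketch of what an aliased output means (API names assumed from this era of ngraph, not shown in the diff): a function that returns the same node twice, so two Result copies are made from one tensor.

// Hypothetical aliased-output function: node C backs both outputs.
auto A = make_shared<op::Parameter>(element::f32, Shape{2, 2});
auto B = make_shared<op::Parameter>(element::f32, Shape{2, 2});
auto C = A + B;
auto f = make_shared<Function>(NodeVector{C, C}, op::ParameterVector{A, B});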
@@ -1305,6 +1305,7 @@ TEST(${BACKEND_NAME}, backwards_slice)
 TEST(${BACKEND_NAME}, backwards_softmax_all)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     auto manager = runtime::Manager::get("${BACKEND_NAME}");
     auto backend = manager->allocate_backend();
@@ -1322,6 +1323,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_all)
 TEST(${BACKEND_NAME}, backwards_softmax_axis)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     auto manager = runtime::Manager::get("${BACKEND_NAME}");
     auto backend = manager->allocate_backend();
@@ -1339,6 +1341,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_axis)
 TEST(${BACKEND_NAME}, backwards_softmax_underflow)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     auto manager = runtime::Manager::get("${BACKEND_NAME}");
     auto backend = manager->allocate_backend();
@@ -1358,6 +1361,7 @@ TEST(${BACKEND_NAME}, backwards_softmax_underflow)
 TEST(${BACKEND_NAME}, backwards_softmax_3d)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     auto manager = runtime::Manager::get("${BACKEND_NAME}");
     auto backend = manager->allocate_backend();
@@ -120,6 +120,7 @@ TEST(${BACKEND_NAME}, component_cleanup)
 TEST(${BACKEND_NAME}, aliased_output)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape{2, 2};
     auto A = make_shared<op::Parameter>(element::f32, shape);
     auto B = make_shared<op::Parameter>(element::f32, shape);
@@ -8441,6 +8442,7 @@ TEST(${BACKEND_NAME}, relu_4Dbackprop)
 TEST(${BACKEND_NAME}, softmax_all)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape{2, 3};
     auto A = make_shared<op::Parameter>(element::f32, shape);
     auto f =
@@ -8473,6 +8475,7 @@ TEST(${BACKEND_NAME}, softmax_all)
 TEST(${BACKEND_NAME}, softmax_axis)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape{2, 3};
     auto A = make_shared<op::Parameter>(element::f32, shape);
     auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{1}), op::ParameterVector{A});
@@ -8501,6 +8504,7 @@ TEST(${BACKEND_NAME}, softmax_axis)
 TEST(${BACKEND_NAME}, softmax_underflow)
 {
+    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
     Shape shape{2, 3};
     auto A = make_shared<op::Parameter>(element::f32, shape);
     auto f = make_shared<Function>(make_shared<op::Softmax>(A, AxisSet{0}), op::ParameterVector{A});
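
SKIP_TEST_FOR's definition is not part of this diff; judging from the call shape, a minimal macro consistent with these call sites would bail out of the test body when the backend under test matches the named backend, along these lines (an assumption, not the project's actual definition):

#include <iostream>
#include <string>

// Hypothetical reconstruction of SKIP_TEST_FOR: return from the enclosing
// test body when the current backend matches the one being skipped.
#define SKIP_TEST_FOR(backend_to_skip, current_backend)                         \
    if (std::string(backend_to_skip) == std::string(current_backend))           \
    {                                                                           \
        std::cout << "Skipping test for backend " << (backend_to_skip) << "\n"; \
        return;                                                                 \
    }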