Commit 95312b8e authored by Fenglei, committed by Robert Kimball

gpu emitter using template function (#610)

* update gpu_emitter to use template functions

* add template
parent b3d2ff59
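
The change replaces the old one-method-per-op emitter (EmitAdd, EmitDot, ...) with a single emit<OP> function template: the primary template throws for any op without a GPU implementation, op-specific code lives in explicit specializations declared through the EMITTER_DECL macro, and the dispatcher maps each op's type_index to the matching &GPU_Emitter::emit<Op>. The sketch below is illustrative only; the Node/Abs/Pad stand-in types and the simplified signature are placeholders, not the ngraph API.

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <typeindex>
#include <unordered_map>

// Stand-ins for the real ngraph op classes; only the names matter here.
struct Node { virtual ~Node() = default; virtual std::string name() const = 0; };
struct Abs : Node { std::string name() const override { return "Abs"; } };
struct Pad : Node { std::string name() const override { return "Pad"; } };

struct Emitter
{
    // Primary template: any op without an explicit specialization falls
    // through to one "unimplemented" error, replacing the hand-written stubs.
    template <typename OP>
    static void emit(const Node* node)
    {
        throw std::runtime_error("Unimplemented op in GPU emitter for " + node->name());
    }
};

// Specialization for one op, analogous to EMITTER_DECL(ngraph::op::Abs).
template <>
void Emitter::emit<Abs>(const Node* node)
{
    std::cout << "emitting CUDA code for " << node->name() << "\n";
}

// Dispatcher keyed by typeid, analogous to the OpMap in gpu_external_function.cpp.
using EmitFn = std::function<void(const Node*)>;
static const std::unordered_map<std::type_index, EmitFn> dispatcher{
    {std::type_index(typeid(Abs)), &Emitter::emit<Abs>},
    {std::type_index(typeid(Pad)), &Emitter::emit<Pad>}, // compiles, throws at run time
};

int main()
{
    Abs abs_op;
    dispatcher.at(std::type_index(typeid(abs_op)))(&abs_op); // prints the Abs line
}
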
@@ -28,21 +28,69 @@
#include <vector>

#include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/allreduce.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/op.hpp"
#include "ngraph/ops/pad.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/result.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/select_and_scatter.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
@@ -51,44 +99,57 @@
using namespace std;
using namespace ngraph;

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Abs)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "int count = " << out[0].get_size() << ";\n";
                writer << "ngraph::runtime::gpu::emit_abs((void*) " << args[0].get_name()
                       << ", (void*) " << out[0].get_name() << ", count);\n";
                writer.indent--;
                writer << "}\n";
            }

            void GPU_Emitter::EmitUnaryElementwise(GPU_ExternalFunction* external_function,
                                                   codegen::CodeWriter& writer,
                                                   const ngraph::Node* node,
                                                   const std::vector<GPU_TensorViewWrapper>& args,
                                                   const std::vector<GPU_TensorViewWrapper>& out)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "int count = " << out[0].get_size() << ";\n";
                writer << "if(count == 0) return;\n";
                writer << "ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::"
                       << node->description() << ">((void*) " << args[0].get_name() << ", (void*) "
                       << out[0].get_name() << ", count, \"" << node->description() << "\");\n";
                writer.indent--;
                writer << "}\n";
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Add)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "int count = " << out[0].get_size() << ";\n";
                writer += R"(
@@ -121,33 +182,24 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                       << "descriptor," << out[0].get_name() << ");\n";
                writer.indent--;
                writer << "}\n";
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Dot)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                const ngraph::op::Dot* dot = static_cast<const ngraph::op::Dot*>(node);
                const Shape& arg0_shape = args[0].get_shape();
                const Shape& arg1_shape = args[1].get_shape();
                if (arg0_shape.empty() || arg1_shape.empty())
                {
                    auto& first = (arg0_shape.empty() ? args[0] : args[1]);
                    auto& second = (arg0_shape.empty() ? args[1] : args[0]);
                    writer << "{ // " << node->get_name() << "\n";
                    writer.indent++;
                    writer << "int count = " << second.get_size() << ";\n";
                    writer << "cublasScopy("
@@ -156,7 +208,8 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                           << "1," << out[0].get_name() << ", 1);\n";
                    writer << "cublasSscal("
                           << "cublas_handle,"
                           << "count ," << first.get_name() << "," << out[0].get_name()
                           << ", 1);\n";
                    writer.indent--;
                    writer << "}\n";
                    return;
@@ -165,10 +218,10 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                //set output to 0 if input size is 0
                if (args[0].get_size() == 0 || args[1].get_size() == 0)
                {
                    writer << "{ // " << node->get_name() << "\n";
                    writer.indent++;
                    writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
                           << out[0].get_size() << " * sizeof(float));\n";
                    writer.indent--;
                    writer << "}\n";
                    return;
@@ -176,7 +229,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1))
                {
                    writer << "{ // " << node->get_name() << "\n";
                    writer.indent++;
                    writer << "cublasSdot("
                           << "cublas_handle," << arg0_shape[0] << "," << args[0].get_name() << ","
@@ -187,7 +240,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                }
                else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
                {
                    writer << "{ // " << node->get_name() << "\n";
                    writer.indent++;
                    writer << "const float alpha = 1.0;\n";
                    writer << "const float beta = 0;\n";
@@ -196,7 +249,8 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                           << "cublas_handle,"
                           << "CUBLAS_OP_T," << arg0_shape[0] << "," << arg0_shape[1] << ","
                           << "&alpha," // Alpha
                           << args[0].get_name() << "," << arg0_shape[1] << ","
                           << args[1].get_name() << ","
                           << "1,"
                           << "&beta," // beta
                           << out[0].get_name() << ","
@@ -214,7 +268,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                {
                    throw std::runtime_error("input and output shape is not correct for dot;");
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "const float alpha = 1.0;\n";
                writer << "const float beta = 0.0;\n";
@@ -242,69 +296,19 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                }
                else
                {
                    throw std::runtime_error(node->get_name() +
                                             " with more then 2D is not implemented.");
                }
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Maximum)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "int count = " << out[0].get_size() << ";\n";
                writer += R"(
@@ -337,18 +341,16 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                       << "descriptor," << out[0].get_name() << ");\n";
                writer.indent--;
                writer << "}\n";
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Minimum)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "int count = " << out[0].get_size() << ";\n";
                writer += R"(
@@ -381,19 +383,16 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                       << "descriptor," << out[0].get_name() << ");\n";
                writer.indent--;
                writer << "}\n";
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Negative)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "int count = " << out[0].get_size() << ";\n";
                writer += R"(
@@ -426,44 +425,16 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                       << "descriptor," << out[0].get_name() << ");\n";
                writer.indent--;
                writer << "}\n";
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Broadcast)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                auto broadcast = static_cast<const ngraph::op::Broadcast*>(node);
                auto arg_shape = args[0].get_shape();
                auto result_shape = out[0].get_shape();
@@ -471,10 +442,11 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
                //broadcast axes is empty, do a copy
                if (axes.empty())
                {
                    writer << "{ // " << node->get_name() << " \n";
                    writer.indent++;
                    writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
                           << args[0].get_name() << ", " << out[0].get_size() << " * "
                           << out[0].get_element_type().size() << ");\n";
                    writer.indent--;
                    writer << "}\n";
                    return;
@@ -510,47 +482,34 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
                        repeat_size *= result_shape[i];
                    }
                    writer << "{ // " << node->get_name() << " \n";
                    writer.indent++;
                    writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", "
                           << out[0].get_name() << ", " << repeat_size << ", " << repeat_times
                           << ", " << out[0].get_size() << ");\n";
                    writer.indent--;
                    writer << "}\n";
                }
                else
                {
                    throw std::runtime_error(node->get_name() + " is not implemented.");
                }
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Constant)
            {
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Reshape)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                auto reshape = static_cast<const op::Reshape*>(node);
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                auto arg_shape = args[0].get_shape();
                auto arg_rank = arg_shape.size();
@@ -571,10 +530,11 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
                // we can just copy.
                if (same_layout || result_shape_product < 2)
                {
                    writer << "{ // " << node->get_name() << " 1\n";
                    writer.indent++;
                    writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
                           << args[0].get_name() << ", " << out[0].get_size() << " * "
                           << out[0].get_element_type().size() << ");\n";
                    writer.indent--;
                    writer << "}\n";
                }
@@ -582,7 +542,7 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
                else if (arg_rank == 2)
                {
                    // TODO Assert arg0_shape[0] == arg1_shape[0]?
                    writer << "{ // " << node->get_name() << "\n";
                    writer.indent++;
                    writer << "const float alpha = 1.0;\n";
                    writer << "const float beta = 0;\n";
@@ -594,8 +554,8 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
                           << "&alpha," // Alpha
                           << args[0].get_name() << "," << arg_shape[1] << ","
                           << "&beta," // beta
                           << args[0].get_name() << "," << arg_shape[1] << "," << out[0].get_name()
                           << "," << result_shape[1] << ");\n";
                    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
                    writer.indent--;
                    writer << "}\n";
@@ -604,55 +564,26 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
                else
                {
                    throw runtime_error(
                        "Axis permutation in reshape is not implemented yet for tensors with "
                        "rank>2");
                }
                writer.indent--;
                writer << "}\n";
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::FunctionCall)
            {
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Multiply)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "int count = " << out[0].get_size() << ";\n";
                writer += R"(
@@ -685,43 +616,16 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                       << "descriptor," << out[0].get_name() << ");\n";
                writer.indent--;
                writer << "}\n";
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Sqrt)
            {
                if (out[0].get_size() == 0)
                {
                    return;
                }
                writer << "{ // " << node->get_name() << "\n";
                writer.indent++;
                writer << "int count = " << out[0].get_size() << ";\n";
                writer += R"(
@@ -754,61 +658,20 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                       << "descriptor," << out[0].get_name() << ");\n";
                writer.indent--;
                writer << "}\n";
            }

            template <>
            void GPU_Emitter::EMITTER_DECL(ngraph::op::Result)
            {
                writer << "{ //" << node->get_name() << "\n";
                writer.indent++;
                writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
                       << args[0].get_name() << ", " << out[0].get_size() << " * "
                       << out[0].get_element_type().size() << ");\n";
                writer.indent--;
                writer << "}\n";
                return;
            }
        }
    }
}
@@ -24,12 +24,12 @@
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"

#define EMITTER_DECL(op_name)                                                  \
    emit<op_name>(GPU_ExternalFunction * external_function,                    \
                  codegen::CodeWriter & writer,                                \
                  const ngraph::Node* node,                                    \
                  const std::vector<GPU_TensorViewWrapper>& args,              \
                  const std::vector<GPU_TensorViewWrapper>& out)

namespace ngraph
{
    namespace runtime
@@ -39,45 +39,30 @@ namespace ngraph
            class GPU_Emitter
            {
            public:
                template <typename OP>
                static void emit(GPU_ExternalFunction* external_function,
                                 codegen::CodeWriter& writer,
                                 const ngraph::Node* node,
                                 const std::vector<GPU_TensorViewWrapper>& args,
                                 const std::vector<GPU_TensorViewWrapper>& out)
                {
                    throw std::runtime_error("Unimplemented op in GPU emitter for " +
                                             node->get_name());
                }

                static void nop(GPU_ExternalFunction* external_function,
                                codegen::CodeWriter& writer,
                                const ngraph::Node* node,
                                const std::vector<GPU_TensorViewWrapper>& args,
                                const std::vector<GPU_TensorViewWrapper>& out)
                {
                }

                static void EmitUnaryElementwise(GPU_ExternalFunction* external_function,
                                                 codegen::CodeWriter& writer,
                                                 const ngraph::Node* node,
                                                 const std::vector<GPU_TensorViewWrapper>& args,
                                                 const std::vector<GPU_TensorViewWrapper>& out);
            };
        }
    }
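
With this header in place, adding GPU codegen for an op means specializing emit through the EMITTER_DECL macro in gpu_emitter.cpp and registering the specialization in the dispatcher in gpu_external_function.cpp; anything left on the unspecialized template fails with the "Unimplemented op" error at code-generation time instead of needing a hand-written stub. A hedged sketch of that wiring for a hypothetical op follows (Relu is only bound to the default emit<> in this commit, and the body here is left as a placeholder comment):

// In gpu_emitter.cpp (sketch only):
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Relu)
{
    if (out[0].get_size() == 0)
    {
        return;
    }
    writer << "{ // " << node->get_name() << "\n";
    writer.indent++;
    // ... emit the CUDA/cuDNN call for Relu here ...
    writer.indent--;
    writer << "}\n";
}

// In gpu_external_function.cpp, inside the dispatcher initializer:
// {TI(ngraph::op::Relu), &GPU_Emitter::emit<ngraph::op::Relu>},
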
......
@@ -41,8 +41,11 @@
#include "ngraph/ops/abs.hpp" #include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp" #include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp" #include "ngraph/ops/add.hpp"
#include "ngraph/ops/allreduce.hpp"
#include "ngraph/ops/asin.hpp" #include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp" #include "ngraph/ops/atan.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/broadcast.hpp" #include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp" #include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concat.hpp" #include "ngraph/ops/concat.hpp"
...@@ -57,24 +60,34 @@ ...@@ -57,24 +60,34 @@
#include "ngraph/ops/exp.hpp" #include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp" #include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp" #include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/greater.hpp" #include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp" #include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp" #include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp" #include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp" #include "ngraph/ops/log.hpp"
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/max_pool.hpp" #include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp" #include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/minimum.hpp" #include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp" #include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp" #include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp" #include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp" #include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp" #include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/op.hpp"
#include "ngraph/ops/pad.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/power.hpp" #include "ngraph/ops/power.hpp"
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp" #include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp" #include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp" #include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp" #include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/result.hpp"
#include "ngraph/ops/reverse.hpp" #include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp" #include "ngraph/ops/select.hpp"
#include "ngraph/ops/select_and_scatter.hpp" #include "ngraph/ops/select_and_scatter.hpp"
...@@ -82,6 +95,7 @@ ...@@ -82,6 +95,7 @@
#include "ngraph/ops/sin.hpp" #include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp" #include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp" #include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sqrt.hpp" #include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp" #include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp" #include "ngraph/ops/sum.hpp"
...@@ -100,7 +114,6 @@ ...@@ -100,7 +114,6 @@
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
using namespace std; using namespace std;
using namespace ngraph;
static const string s_output_dir = "gpu_codegen"; static const string s_output_dir = "gpu_codegen";
...@@ -146,82 +159,110 @@ static StaticInitializers s_static_initializers; ...@@ -146,82 +159,110 @@ static StaticInitializers s_static_initializers;
#define TI(x) type_index(typeid(x)) #define TI(x) type_index(typeid(x))

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            static const OpMap dispatcher{
                {TI(ngraph::op::Add), &GPU_Emitter::emit<ngraph::op::Add>},
                {TI(ngraph::op::Dot), &GPU_Emitter::emit<ngraph::op::Dot>},
                {TI(ngraph::op::Multiply), &GPU_Emitter::emit<ngraph::op::Multiply>},
                {TI(ngraph::op::Parameter), &GPU_Emitter::nop},
                {TI(ngraph::op::Abs), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Concat), &GPU_Emitter::emit<ngraph::op::Concat>},
                {TI(ngraph::op::Divide), &GPU_Emitter::emit<ngraph::op::Divide>},
                {TI(ngraph::op::Equal), &GPU_Emitter::emit<ngraph::op::Equal>},
                {TI(ngraph::op::GetOutputElement),
                 &GPU_Emitter::emit<ngraph::op::GetOutputElement>},
                {TI(ngraph::op::Greater), &GPU_Emitter::emit<ngraph::op::Greater>},
                {TI(ngraph::op::GreaterEq), &GPU_Emitter::emit<ngraph::op::GreaterEq>},
                {TI(ngraph::op::Less), &GPU_Emitter::emit<ngraph::op::Less>},
                {TI(ngraph::op::LessEq), &GPU_Emitter::emit<ngraph::op::LessEq>},
                {TI(ngraph::op::Log), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Maximum), &GPU_Emitter::emit<ngraph::op::Maximum>},
                {TI(ngraph::op::Minimum), &GPU_Emitter::emit<ngraph::op::Minimum>},
                {TI(ngraph::op::Negative), &GPU_Emitter::emit<ngraph::op::Negative>},
                {TI(ngraph::op::NotEqual), &GPU_Emitter::emit<ngraph::op::NotEqual>},
                {TI(ngraph::op::Power), &GPU_Emitter::emit<ngraph::op::Power>},
                {TI(ngraph::op::Select), &GPU_Emitter::emit<ngraph::op::Select>},
                {TI(ngraph::op::Subtract), &GPU_Emitter::emit<ngraph::op::Subtract>},
                {TI(ngraph::op::Broadcast), &GPU_Emitter::emit<ngraph::op::Broadcast>},
                {TI(ngraph::op::Convert), &GPU_Emitter::emit<ngraph::op::Convert>},
                {TI(ngraph::op::Constant), &GPU_Emitter::emit<ngraph::op::Constant>},
                {TI(ngraph::op::Reshape), &GPU_Emitter::emit<ngraph::op::Reshape>},
                {TI(ngraph::op::FunctionCall), &GPU_Emitter::emit<ngraph::op::FunctionCall>},
                {TI(ngraph::op::Reduce), &GPU_Emitter::emit<ngraph::op::Reduce>},
                {TI(ngraph::op::Sign), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Slice), &GPU_Emitter::emit<ngraph::op::Slice>},
                {TI(ngraph::op::Sum), &GPU_Emitter::emit<ngraph::op::Sum>},
                {TI(ngraph::op::Exp), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Sin), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Sinh), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Cos), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Cosh), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Tan), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Tanh), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Asin), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Acos), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Atan), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::ReplaceSlice), &GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
                {TI(ngraph::op::OneHot), &GPU_Emitter::emit<ngraph::op::OneHot>},
                {TI(ngraph::op::Floor), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Ceiling), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::Sqrt), &GPU_Emitter::emit<ngraph::op::Sqrt>},
                {TI(ngraph::op::Convolution), &GPU_Emitter::emit<ngraph::op::Convolution>},
                {TI(ngraph::op::ConvolutionBackpropFilters),
                 &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
                {TI(ngraph::op::ConvolutionBackpropData),
                 &GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
                {TI(ngraph::op::Not), &GPU_Emitter::EmitUnaryElementwise},
                {TI(ngraph::op::MaxPool), &GPU_Emitter::emit<ngraph::op::MaxPool>},
                {TI(ngraph::op::Reverse), &GPU_Emitter::emit<ngraph::op::Reverse>},
                {TI(ngraph::op::Result), &GPU_Emitter::emit<ngraph::op::Result>},
                {TI(ngraph::op::ReduceWindow), &GPU_Emitter::emit<ngraph::op::ReduceWindow>},
                {TI(ngraph::op::SelectAndScatter),
                 &GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
                {TI(ngraph::op::AvgPool), &GPU_Emitter::emit<ngraph::op::AvgPool>},
                {TI(ngraph::op::AvgPoolBackprop), &GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
                {TI(ngraph::op::Pad), &GPU_Emitter::emit<ngraph::op::Pad>},
                {TI(ngraph::op::BatchNorm), &GPU_Emitter::emit<ngraph::op::BatchNorm>},
                {TI(ngraph::op::BatchNormBackprop),
                 &GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
                {TI(ngraph::op::MaxPoolBackprop), &GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
                {TI(ngraph::op::Product), &GPU_Emitter::emit<ngraph::op::Product>},
                {TI(ngraph::op::Max), &GPU_Emitter::emit<ngraph::op::Max>},
                {TI(ngraph::op::Min), &GPU_Emitter::emit<ngraph::op::Min>},
                {TI(ngraph::op::Relu), &GPU_Emitter::emit<ngraph::op::Relu>},
                {TI(ngraph::op::ReluBackprop), &GPU_Emitter::emit<ngraph::op::ReluBackprop>},
                {TI(ngraph::op::Softmax), &GPU_Emitter::emit<ngraph::op::Softmax>},
            };

            GPU_ExternalFunction::GPU_ExternalFunction(const shared_ptr<ngraph::Function>& function,
                                                       bool release_function)
                : ngraph::runtime::ExternalFunction(function, release_function)
                , m_compiled_function(nullptr)
                , m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
            {
            }

            void GPU_ExternalFunction::compile()
            {
                if (m_is_compiled)
                {
                    return;
                }

                string function_name = m_function->get_name();
                string dump_filename =
                    file_util::path_join(s_output_dir, function_name + "_ops.txt");

                pass::Manager pass_manager;
                // pass_manager.register_pass<pass::TopologicalSort>();
                // For now, just make everyone row-major.
                pass_manager
                    .register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
                pass_manager.register_pass<pass::Liveness>();
                pass_manager.register_pass<pass::MemoryLayout>(64);
                pass_manager.register_pass<pass::DumpSorted>(dump_filename);
@@ -267,7 +308,8 @@ using namespace std;
                {
                    writer << "// Declare debug timers\n";
                    vector<string> names;
                    for (shared_ptr<Function> current_function :
                         pass_manager.get_state().get_functions())
                    {
                        for (shared_ptr<Node> node : current_function->get_ordered_ops())
                        {
@@ -281,8 +323,8 @@ using namespace std;
                    {
                        writer << "ngraph::stopwatch timer_" << s << ";\n";
                    }
                    writer << "extern \"C\" size_t get_debug_timer_count() { return "
                           << names.size() << "; }\n";
                    writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
                    writer << "{\n";
                    writer.indent++;
@@ -298,7 +340,8 @@ using namespace std;
                    writer << "return rc;\n";
                    writer.indent--;
                    writer << "}\n";
                    writer
                        << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
                    writer << "{\n";
                    writer.indent++;
                    writer << "size_t rc;\n";
@@ -314,7 +357,8 @@ using namespace std;
                    writer << "return rc;\n";
                    writer.indent--;
                    writer << "}\n";
                    writer
                        << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
                    writer << "{\n";
                    writer.indent++;
                    writer << "size_t rc;\n";
@@ -322,7 +366,8 @@ using namespace std;
                    writer << "{\n";
                    for (size_t i = 0; i < names.size(); i++)
                    {
                        writer << "case " << i << ": rc = timer_" << names[i]
                               << ".get_call_count(); break;\n";
                    }
                    writer << "default: rc = 0;\n";
                    writer << "}\n";
@@ -338,26 +383,31 @@ using namespace std;
                writer << "void *__dso_handle = 0;\n\n";

                writer << "// Declare all constants\n";
                for (shared_ptr<Function> current_function :
                     pass_manager.get_state().get_functions())
                {
                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
                    {
                        const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
                        if (c)
                        {
                            shared_ptr<descriptor::TensorView> tv =
                                node->get_outputs()[0].get_tensor_view();
                            auto c_value_strings = c->get_value_strings();
                            writer << "static "
                                   << tv->get_tensor().get_element_type().c_type_string() << " "
                                   << tv->get_tensor().get_name() << "_cpu["
                                   << c_value_strings.size() << "] =\n";
                            writer << "{\n";
                            writer.indent++;
                            writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
                            writer.indent--;
                            writer << "\n};\n\n";
                            writer << "static "
                                   << tv->get_tensor().get_element_type().c_type_string() << " *"
                                   << tv->get_tensor().get_name() << ";\n";
                            m_variable_name_map[tv->get_tensor().get_name()] =
                                tv->get_tensor().get_name();
                        }
                    }
                }
@@ -365,7 +415,8 @@ using namespace std;
                writer << "// Declare all functions\n";
                for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
                {
                    writer << "extern \"C\" void " << f->get_name()
                           << "(void** inputs, void** outputs, "
                              "cublasHandle_t& cublas_handle, "
                              "cudnnHandle_t& cudnn_handle);\n";
                }
@@ -373,7 +424,8 @@ using namespace std;
                writer << "\n";

                unordered_map<Node*, string> match_functions;
                for (shared_ptr<Function> current_function :
                     pass_manager.get_state().get_functions())
                {
                    bool temporaries_used = false;
                    size_t worst_case_tmp_size = 0;
@@ -459,14 +511,15 @@ using namespace std;
                            writer << "\n)\n";
                            writer << "{\n";
                            writer.indent++;
                            handler->second(this, writer, &n, in, out);
                            writer.indent--;
                            writer << "}\n";
                        }
                    }
                }
                for (shared_ptr<Function> current_function :
                     pass_manager.get_state().get_functions())
                {
                    set<string> output_names;
                    for (shared_ptr<Node> op : current_function->get_results())
@@ -477,20 +530,23 @@ using namespace std;
                    set<descriptor::TensorView*> constants;
                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
                    {
                        if (dynamic_cast<ngraph::op::Constant*>(node.get()))
                        {
                            shared_ptr<descriptor::TensorView> tv =
                                node->get_outputs()[0].get_tensor_view();
                            constants.insert(tv.get());
                        }
                    }

                    writer << "extern \"C\" void " << current_function->get_name();
                    writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle, "
                              "cudnnHandle_t& "
                              "cudnn_handle)\n";
                    writer << "{\n";
                    writer.indent++;

                    for (shared_ptr<Function> current_function :
                         pass_manager.get_state().get_functions())
                    {
                        for (shared_ptr<Node> node : current_function->get_ordered_ops())
                        {
@@ -502,9 +558,10 @@ using namespace std;
                            writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
                            writer << "{\n";
                            writer.indent++;
                            writer << "runtime::gpu::cuda_memcpyHtD("
                                   << tv->get_tensor().get_name() << ", "
                                   << tv->get_tensor().get_name() << "_cpu, "
                                   << tv->get_tensor().size() << ");\n";
                            writer.indent--;
                            writer << "}\n";
                        }
@@ -528,8 +585,8 @@ using namespace std;
                    size_t temp_pool_size = current_function->get_temporary_pool_size();
                    writer << "// Allocate the memory pool\n";
                    // TODO memory pool malloc.
                    writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
                           << temp_pool_size << ");\n";

                    // Add temporaries to the variable name map
                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
@@ -538,7 +595,8 @@ using namespace std;
                        {
                            stringstream ss;
                            ss << "((" << tensor->get_element_type().c_type_string()
                               << "*)((char *)pool_base_ptr + " << tensor->get_pool_offset()
                               << "))";
                            m_variable_name_map[tensor->get_name()] = ss.str();
                        }
                    }
@@ -546,12 +604,15 @@ using namespace std;
                    // Add inputs to the variable name map
                    size_t arg_index = 0;
                    for (shared_ptr<ngraph::op::Parameter> param :
                         current_function->get_parameters())
                    {
                        for (size_t i = 0; i < param->get_output_size(); ++i)
                        {
                            shared_ptr<descriptor::TensorView> tv =
                                param->get_output_tensor_view(i);
                            const element::Type& et =
                                tv->get_tensor_view_type()->get_element_type();
                            string type = et.c_type_string();
                            stringstream ss;
                            ss << "((" << type << "*)(inputs[" << arg_index << "]))";
@@ -585,7 +646,8 @@ using namespace std;
                        shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
                        const element::Type& et = tv->get_tensor_view_type()->get_element_type();
                        bool parameter_as_output = false;
                        for (shared_ptr<ngraph::op::Parameter> param :
                             current_function->get_parameters())
                        {
                            for (const descriptor::Output& pout : param->get_outputs())
                            {
@@ -593,8 +655,10 @@ using namespace std;
                                if (tv == ptv)
                                {
                                    parameter_as_output = true;
                                    writer
                                        << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
                                        << et.c_type_string() << "*>(outputs[" << output_index
                                        << "]), "
                                        << m_variable_name_map[ptv->get_tensor().get_name()] << ", "
                                        << ptv->get_tensor().size() << ");\n";
                                    break;
@@ -605,9 +669,9 @@ using namespace std;
                        {
                            if (contains(constants, tv.get()))
                            {
                                writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs["
                                       << output_index << "], " << tv->get_tensor().get_name()
                                       << ", " << tv->get_tensor().size() << ");\n";
                            }
                            else
                            {
@@ -622,27 +686,29 @@ using namespace std;
                    for (shared_ptr<Node> node : current_function->get_ordered_ops())
                    {
                        auto& n =
                            *node; // Work around a compiler warning (*node inside typeid may have effects
                        // with shared pointers, which is fine here but clang doesn't like it.)
                        auto handler = dispatcher.find(type_index(typeid(n)));
                        if (handler == dispatcher.end())
                        {
                            throw ngraph_error("Unhandled op during code generation : " +
                                               node->description());
node->description());
} }
vector<GPU_TensorViewWrapper> in; vector<GPU_TensorViewWrapper> in;
for (const descriptor::Input& input : node->get_inputs()) for (const descriptor::Input& input : node->get_inputs())
{ {
const descriptor::Output& output = input.get_output(); const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view(); shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back( in.push_back(GPU_TensorViewWrapper(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()])); tv, m_variable_name_map[tv->get_tensor().get_name()]));
} }
vector<GPU_TensorViewWrapper> out; vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs()) for (const descriptor::Output& output : node->get_outputs())
{ {
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view(); shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back( out.push_back(GPU_TensorViewWrapper(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()])); tv, m_variable_name_map[tv->get_tensor().get_name()]));
} }
// Emit operation prologue // Emit operation prologue
...@@ -663,7 +729,7 @@ using namespace std; ...@@ -663,7 +729,7 @@ using namespace std;
} }
if (func_name.empty()) if (func_name.empty())
{ {
handler->second(writer, node.get(), in, out); handler->second(this, writer, node.get(), in, out);
} }
else else
{ {
...@@ -696,7 +762,8 @@ using namespace std; ...@@ -696,7 +762,8 @@ using namespace std;
// TODO: Cleanup and make this a utility function // TODO: Cleanup and make this a utility function
file_util::make_directory(s_output_dir); file_util::make_directory(s_output_dir);
string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp"); string filename =
file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
ofstream out(filename); ofstream out(filename);
string code = writer.get_code(); string code = writer.get_code();
out << code; out << code;
...@@ -715,7 +782,8 @@ using namespace std; ...@@ -715,7 +782,8 @@ using namespace std;
} }
m_execution_engine->add_module(codegen_module); m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize(); m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name); m_compiled_function =
m_execution_engine->find_function<EntryPoint_t>(function_name);
assert(m_compiled_function); assert(m_compiled_function);
m_is_compiled = true; m_is_compiled = true;
...@@ -723,13 +791,13 @@ using namespace std; ...@@ -723,13 +791,13 @@ using namespace std;
{ {
release_function(); release_function();
} }
} }
void runtime::gpu::GPU_ExternalFunction::handle_output_alias( void GPU_ExternalFunction::handle_output_alias(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const Node& node, const Node& node,
const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map) const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
{ {
for (const descriptor::Output& output : node.get_outputs()) for (const descriptor::Output& output : node.get_outputs())
{ {
shared_ptr<descriptor::TensorView> otv = output.get_tensor_view(); shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
...@@ -743,42 +811,46 @@ void runtime::gpu::GPU_ExternalFunction::handle_output_alias( ...@@ -743,42 +811,46 @@ void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
writer.indent++; writer.indent++;
for (size_t i = 1; i < outputs.size(); i++) for (size_t i = 1; i < outputs.size(); i++)
{ {
writer << "runtime::gpu::cuda_memcpyDtD(static_cast<void*>(outputs[" writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
<< outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0] "outputs["
<< "]), " << otv->get_tensor().size() << ");\n"; << outputs[i] << "]), static_cast<void*>(outputs["
<< outputs[0] << "]), " << otv->get_tensor().size()
<< ");\n";
} }
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
} }
} }
} }
shared_ptr<ngraph::runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame() shared_ptr<ngraph::runtime::CallFrame> GPU_ExternalFunction::make_call_frame()
{ {
if (!m_is_compiled) if (!m_is_compiled)
{ {
compile(); compile();
} }
return make_shared<ngraph::runtime::gpu::GPU_CallFrame>(shared_from_this(), return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
m_compiled_function); }
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry( void GPU_ExternalFunction::emit_debug_function_entry(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
Node* node, Node* node,
const std::vector<GPU_TensorViewWrapper>& in, const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out) const std::vector<GPU_TensorViewWrapper>& out)
{ {
writer << "timer_" << node->get_name() << ".start();\n"; writer << "timer_" << node->get_name() << ".start();\n";
} }
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit( void GPU_ExternalFunction::emit_debug_function_exit(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
Node* node, Node* node,
const std::vector<GPU_TensorViewWrapper>& in, const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out) const std::vector<GPU_TensorViewWrapper>& out)
{ {
writer << "timer_" << node->get_name() << ".stop();\n"; writer << "timer_" << node->get_name() << ".stop();\n";
}
}
}
} }
\ No newline at end of file
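For orientation, a hedged sketch of the shape of the code this external function emits for one compiled function. Nothing below is produced by the real compiler: the handle typedefs and the runtime helpers are stand-ins so the snippet is self-contained, and "my_function", the constant names, the pool size, and the offsets are invented for illustration; only the helper names (create_gpu_buffer, cuda_memcpyHtD, cuda_memcpyDtD) and the entry-point parameter list come from the diff above.

// Hedged, self-contained sketch of the emitted entry point; illustration only.
#include <cstddef>

using cublasHandle_t = void*; // stand-in; the real type comes from cuBLAS
using cudnnHandle_t = void*;  // stand-in; the real type comes from cuDNN

namespace ngraph { namespace runtime { namespace gpu {
    // Stand-ins for the helpers the generated code calls.
    inline void* create_gpu_buffer(std::size_t) { return nullptr; }
    inline void cuda_memcpyHtD(void*, const void*, std::size_t) {}
    inline void cuda_memcpyDtD(void*, const void*, std::size_t) {}
}}}

static const float c_0_cpu[4] = {0, 0, 0, 0}; // host copy of a constant tensor
static float* c_0 = nullptr;                  // lazily populated device copy

extern "C" void my_function(void** inputs,
                            void** outputs,
                            cublasHandle_t& cublas_handle,
                            cudnnHandle_t& cudnn_handle)
{
    // Constants are copied host-to-device the first time the function runs.
    if (c_0 == nullptr)
    {
        // (device allocation omitted in this sketch)
        ngraph::runtime::gpu::cuda_memcpyHtD(c_0, c_0_cpu, sizeof(c_0_cpu));
    }
    // One buffer backs all temporaries; each temporary is addressed as
    // ((T*)((char*)pool_base_ptr + offset)), mirroring the variable name map.
    void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer(1024);
    float* t_0 = (float*)((char*)pool_base_ptr + 0);
    // Inputs and outputs are addressed through the raw pointer arrays.
    float* arg_0 = (float*)(inputs[0]);
    float* out_0 = (float*)(outputs[0]);
    // ... per-op kernels / cuBLAS / cuDNN calls are emitted here ...
    (void)t_0; (void)arg_0; (void)out_0;
    (void)cublas_handle; (void)cudnn_handle;
}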
...@@ -41,7 +41,8 @@ namespace ngraph ...@@ -41,7 +41,8 @@ namespace ngraph
class GPU_CallFrame; class GPU_CallFrame;
using OpFunction = using OpFunction =
std::function<void(codegen::CodeWriter&, std::function<void(GPU_ExternalFunction* external_function,
codegen::CodeWriter&,
const ngraph::Node*, const ngraph::Node*,
const std::vector<GPU_TensorViewWrapper>& inputs, const std::vector<GPU_TensorViewWrapper>& inputs,
const std::vector<GPU_TensorViewWrapper>& outputs)>; const std::vector<GPU_TensorViewWrapper>& outputs)>;
......
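The hunk above changes OpFunction so every emitter receives the owning GPU_ExternalFunction as its first argument, matching the earlier call-site change to handler->second(this, writer, node.get(), in, out). Below is a minimal standalone sketch of that dispatch pattern, assuming simplified stand-ins for the ngraph types (CodeWriter, Node, GPU_TensorViewWrapper, GPU_ExternalFunction) and a hypothetical emit<OP> template; it illustrates the "template function" idea of the commit rather than the actual ngraph implementation.

// Hedged sketch of the new dispatch signature; types are simplified stand-ins.
#include <functional>
#include <iostream>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include <vector>

struct CodeWriter
{
    std::string code;
    CodeWriter& operator<<(const std::string& s) { code += s; return *this; }
};
struct Node { virtual ~Node() = default; };
struct Add : Node {};
struct GPU_TensorViewWrapper { std::string name; };
struct GPU_ExternalFunction {};

// The new emitter signature: the external function comes first, as in the diff.
using OpFunction = std::function<void(GPU_ExternalFunction*,
                                      CodeWriter&,
                                      const Node*,
                                      const std::vector<GPU_TensorViewWrapper>&,
                                      const std::vector<GPU_TensorViewWrapper>&)>;

// One template instantiation per op type, registered in a type_index-keyed map.
template <typename OP>
void emit(GPU_ExternalFunction*,
          CodeWriter& writer,
          const Node*,
          const std::vector<GPU_TensorViewWrapper>&,
          const std::vector<GPU_TensorViewWrapper>&)
{
    writer << std::string("// code for ") + typeid(OP).name() + "\n";
}

int main()
{
    std::unordered_map<std::type_index, OpFunction> dispatcher{
        {std::type_index(typeid(Add)), &emit<Add>}};

    GPU_ExternalFunction external_function;
    CodeWriter writer;
    Add node;
    auto& n = node; // look up the handler by the node's dynamic type
    auto handler = dispatcher.find(std::type_index(typeid(n)));
    if (handler != dispatcher.end())
    {
        handler->second(&external_function, writer, &node, {}, {});
    }
    std::cout << writer.code;
}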