Commit 95312b8e authored by Fenglei's avatar Fenglei Committed by Robert Kimball

gpu emitter using template function (#610)

* update gpu_emitter use template

* add template
parent b3d2ff59
...@@ -28,21 +28,69 @@ ...@@ -28,21 +28,69 @@
#include <vector> #include <vector>
#include "ngraph/node.hpp" #include "ngraph/node.hpp"
#include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/allreduce.hpp"
#include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/broadcast.hpp" #include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concat.hpp" #include "ngraph/ops/concat.hpp"
#include "ngraph/ops/constant.hpp" #include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convert.hpp"
#include "ngraph/ops/convolution.hpp" #include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/cos.hpp"
#include "ngraph/ops/cosh.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/dot.hpp" #include "ngraph/ops/dot.hpp"
#include "ngraph/ops/equal.hpp"
#include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp" #include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_output_element.hpp" #include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/max_pool.hpp" #include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp" #include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/op.hpp"
#include "ngraph/ops/pad.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp" #include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp" #include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp" #include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/result.hpp"
#include "ngraph/ops/reverse.hpp" #include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp"
#include "ngraph/ops/select_and_scatter.hpp"
#include "ngraph/ops/sign.hpp"
#include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp" #include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp" #include "ngraph/ops/sum.hpp"
#include "ngraph/ops/tan.hpp"
#include "ngraph/ops/tanh.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp" #include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
...@@ -51,47 +99,60 @@ ...@@ -51,47 +99,60 @@
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer, namespace ngraph
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{ {
} namespace runtime
void runtime::gpu::GPU_Emitter::EmitUnaryElementwise(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{ {
return; namespace gpu
} {
writer << "{ // " << n->get_name() << "\n"; template <>
writer.indent++; void GPU_Emitter::EMITTER_DECL(ngraph::op::Abs)
writer << "int count = " << out[0].get_size() << ";\n"; {
writer << "if(count == 0) return;\n"; if (out[0].get_size() == 0)
writer << "ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::" << n->description() {
<< ">((void*) " << args[0].get_name() << ", (void*) " << out[0].get_name() return;
<< ", count, \"" << n->description() << "\");\n"; }
writer.indent--; writer << "{ // " << node->get_name() << "\n";
writer << "}\n"; writer.indent++;
} writer << "int count = " << out[0].get_size() << ";\n";
writer << "ngraph::runtime::gpu::emit_abs((void*) " << args[0].get_name()
<< ", (void*) " << out[0].get_name() << ", count);\n";
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer, void GPU_Emitter::EmitUnaryElementwise(GPU_ExternalFunction* external_function,
const ngraph::Node* n, codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, const ngraph::Node* node,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) const std::vector<GPU_TensorViewWrapper>& args,
{ const std::vector<GPU_TensorViewWrapper>& out)
if (out[0].get_size() == 0) {
{ if (out[0].get_size() == 0)
return; {
} return;
writer << "{ // " << n->get_name() << "\n"; }
writer.indent++; writer << "{ // " << node->get_name() << "\n";
writer << "int count = " << out[0].get_size() << ";\n"; writer.indent++;
writer += R"( writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::"
<< node->description() << ">((void*) " << args[0].get_name() << ", (void*) "
<< out[0].get_name() << ", count, \"" << node->description() << "\");\n";
writer.indent--;
writer << "}\n";
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Add)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0; float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
...@@ -111,203 +172,146 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -111,203 +172,146 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN); CUDNN_NOT_PROPAGATE_NAN);
)"; )";
writer << "cudnnOpTensor(cudnn_handle," writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc," << "opTensorDesc,"
<< "&alpha1," << "&alpha1,"
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&alpha2," << "&alpha2,"
<< "descriptor," << args[1].get_name() << "," << "descriptor," << args[1].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
const ngraph::op::Dot* dot = static_cast<const ngraph::op::Dot*>(n);
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
if (arg0_shape.empty() || arg1_shape.empty())
{
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << second.get_size() << ";\n";
writer << "cublasScopy("
<< "cublas_handle,"
<< "count ," << second.get_name() << ","
<< "1," << out[0].get_name() << ", 1);\n";
writer << "cublasSscal("
<< "cublas_handle,"
<< "count ," << first.get_name() << "," << out[0].get_name() << ", 1);\n";
writer.indent--;
writer << "}\n";
return;
}
//set output to 0 if input size is 0
if (args[0].get_size() == 0 || args[1].get_size() == 0)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, " << out[0].get_size()
<< " * sizeof(float));\n";
writer.indent--;
writer << "}\n";
return;
}
if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1))
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "cublasSdot("
<< "cublas_handle," << arg0_shape[0] << "," << args[0].get_name() << ","
<< "1," << args[1].get_name() << ","
<< "1," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgemv("
<< "cublas_handle,"
<< "CUBLAS_OP_T," << arg0_shape[0] << "," << arg0_shape[1] << ","
<< "&alpha," // Alpha
<< args[0].get_name() << "," << arg0_shape[1] << "," << args[1].get_name() << ","
<< "1,"
<< "&beta," // beta
<< out[0].get_name() << ","
<< "1);\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
if (arg0_shape[0] != out[0].get_shape()[0] || // m
arg1_shape[1] != out[0].get_shape()[1] || // n
arg0_shape[1] != arg1_shape[0]) // k
{
throw std::runtime_error("input and output shape is not correct for dot;");
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0.0;\n";
writer << "int m = " << arg0_shape[0] << ";\n";
writer << "int n = " << arg1_shape[1] << ";\n";
writer << "int k = " << arg0_shape[0] << ";\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgemm("
<< "cublas_handle,"
<< "CUBLAS_OP_N,"
<< "CUBLAS_OP_N,"
<< "n,"
<< "m,"
<< "k,"
<< "&alpha," // Alpha
<< args[1].get_name() << ","
<< "n," << args[0].get_name() << ","
<< "k,"
<< "&beta," // beta
<< out[0].get_name() << ","
<< "n);\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else
{
throw std::runtime_error(n->get_name() + " with more then 2D is not implemented.");
}
}
void runtime::gpu::GPU_Emitter::EmitDivide(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitGreaterEq(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitLess(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer, template <>
const ngraph::Node* n, void GPU_Emitter::EMITTER_DECL(ngraph::op::Dot)
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) if (out[0].get_size() == 0)
{ {
throw std::runtime_error(n->get_name() + " is not implemented."); return;
} }
const ngraph::op::Dot* dot = static_cast<const ngraph::op::Dot*>(node);
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
if (arg0_shape.empty() || arg1_shape.empty())
{
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "int count = " << second.get_size() << ";\n";
writer << "cublasScopy("
<< "cublas_handle,"
<< "count ," << second.get_name() << ","
<< "1," << out[0].get_name() << ", 1);\n";
writer << "cublasSscal("
<< "cublas_handle,"
<< "count ," << first.get_name() << "," << out[0].get_name()
<< ", 1);\n";
writer.indent--;
writer << "}\n";
return;
}
//set output to 0 if input size is 0
if (args[0].get_size() == 0 || args[1].get_size() == 0)
{
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * sizeof(float));\n";
writer.indent--;
writer << "}\n";
return;
}
if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1))
{
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "cublasSdot("
<< "cublas_handle," << arg0_shape[0] << "," << args[0].get_name() << ","
<< "1," << args[1].get_name() << ","
<< "1," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgemv("
<< "cublas_handle,"
<< "CUBLAS_OP_T," << arg0_shape[0] << "," << arg0_shape[1] << ","
<< "&alpha," // Alpha
<< args[0].get_name() << "," << arg0_shape[1] << ","
<< args[1].get_name() << ","
<< "1,"
<< "&beta," // beta
<< out[0].get_name() << ","
<< "1);\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
if (arg0_shape[0] != out[0].get_shape()[0] || // m
arg1_shape[1] != out[0].get_shape()[1] || // n
arg0_shape[1] != arg1_shape[0]) // k
{
throw std::runtime_error("input and output shape is not correct for dot;");
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0.0;\n";
writer << "int m = " << arg0_shape[0] << ";\n";
writer << "int n = " << arg1_shape[1] << ";\n";
writer << "int k = " << arg0_shape[0] << ";\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgemm("
<< "cublas_handle,"
<< "CUBLAS_OP_N,"
<< "CUBLAS_OP_N,"
<< "n,"
<< "m,"
<< "k,"
<< "&alpha," // Alpha
<< args[1].get_name() << ","
<< "n," << args[0].get_name() << ","
<< "k,"
<< "&beta," // beta
<< out[0].get_name() << ","
<< "n);\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else
{
throw std::runtime_error(node->get_name() +
" with more then 2D is not implemented.");
}
}
void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer, template <>
const ngraph::Node* n, void GPU_Emitter::EMITTER_DECL(ngraph::op::Maximum)
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) if (out[0].get_size() == 0)
{ {
if (out[0].get_size() == 0) return;
{ }
return; writer << "{ // " << node->get_name() << "\n";
} writer.indent++;
writer << "{ // " << n->get_name() << "\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer.indent++; writer += R"(
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0; float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
...@@ -327,31 +331,29 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -327,31 +331,29 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN); CUDNN_NOT_PROPAGATE_NAN);
)"; )";
writer << "cudnnOpTensor(cudnn_handle," writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc," << "opTensorDesc,"
<< "&alpha1," << "&alpha1,"
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&alpha2," << "&alpha2,"
<< "descriptor," << args[1].get_name() << "," << "descriptor," << args[1].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer, template <>
const ngraph::Node* n, void GPU_Emitter::EMITTER_DECL(ngraph::op::Minimum)
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) if (out[0].get_size() == 0)
{ {
if (out[0].get_size() == 0) return;
{ }
return; writer << "{ // " << node->get_name() << "\n";
} writer.indent++;
writer << "{ // " << n->get_name() << "\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer.indent++; writer += R"(
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0; float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
...@@ -371,32 +373,29 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -371,32 +373,29 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN); CUDNN_NOT_PROPAGATE_NAN);
)"; )";
writer << "cudnnOpTensor(cudnn_handle," writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc," << "opTensorDesc,"
<< "&alpha1," << "&alpha1,"
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&alpha2," << "&alpha2,"
<< "descriptor," << args[1].get_name() << "," << "descriptor," << args[1].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
void runtime::gpu::GPU_Emitter::EmitNegative( template <>
codegen::CodeWriter& writer, void GPU_Emitter::EMITTER_DECL(ngraph::op::Negative)
const ngraph::Node* n, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, if (out[0].get_size() == 0)
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) {
{ return;
if (out[0].get_size() == 0) }
{ writer << "{ // " << node->get_name() << "\n";
return; writer.indent++;
} writer << "int count = " << out[0].get_size() << ";\n";
writer << "{ // " << n->get_name() << "\n"; writer += R"(
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = -1.0, alpha2 = 0, beta = 0; float alpha1 = -1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
...@@ -416,246 +415,178 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -416,246 +415,178 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN); CUDNN_NOT_PROPAGATE_NAN);
)"; )";
writer << "cudnnOpTensor(cudnn_handle," writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc," << "opTensorDesc,"
<< "&alpha1," << "&alpha1,"
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&alpha2," << "&alpha2,"
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
void runtime::gpu::GPU_Emitter::EmitNotEqual(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSelect(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSubtract(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitBroadcast(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
auto broadcast = static_cast<const ngraph::op::Broadcast*>(n);
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
auto& axes = broadcast->get_broadcast_axes();
//broadcast axes is empty, do a copy
if (axes.empty())
{
writer << "{ // " << n->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
<< ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
return;
}
//broadcast axes size is 1, or can be group to 1 (consecutive axes, like 01 or 12 or 123 etc) template <>
vector<int> axes_v; void GPU_Emitter::EMITTER_DECL(ngraph::op::Broadcast)
std::copy(axes.begin(), axes.end(), std::back_inserter(axes_v));
std::sort(axes_v.begin(), axes_v.end());
bool is_one_axes = true;
if (axes.size() != 1)
{
for (int i = 1; i < axes_v.size(); i++)
{
if (axes_v[i] != axes_v[i - 1] + 1)
{ {
is_one_axes = false; if (out[0].get_size() == 0)
break; {
return;
}
auto broadcast = static_cast<const ngraph::op::Broadcast*>(node);
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
auto& axes = broadcast->get_broadcast_axes();
//broadcast axes is empty, do a copy
if (axes.empty())
{
writer << "{ // " << node->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
<< args[0].get_name() << ", " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
return;
}
//broadcast axes size is 1, or can be group to 1 (consecutive axes, like 01 or 12 or 123 etc)
vector<int> axes_v;
std::copy(axes.begin(), axes.end(), std::back_inserter(axes_v));
std::sort(axes_v.begin(), axes_v.end());
bool is_one_axes = true;
if (axes.size() != 1)
{
for (int i = 1; i < axes_v.size(); i++)
{
if (axes_v[i] != axes_v[i - 1] + 1)
{
is_one_axes = false;
break;
}
}
}
if (is_one_axes)
{
int repeat_times = 1;
for (int i = 0; i < axes_v.size(); i++)
{
repeat_times *= result_shape[axes_v[i]];
}
int repeat_size = 1;
for (int i = *axes_v.rbegin() + 1; i < result_shape.size(); i++)
{
repeat_size *= result_shape[i];
}
writer << "{ // " << node->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", "
<< out[0].get_name() << ", " << repeat_size << ", " << repeat_times
<< ", " << out[0].get_size() << ");\n";
writer.indent--;
writer << "}\n";
}
else
{
throw std::runtime_error(node->get_name() + " is not implemented.");
}
} }
}
}
if (is_one_axes)
{
int repeat_times = 1;
for (int i = 0; i < axes_v.size(); i++)
{
repeat_times *= result_shape[axes_v[i]];
}
int repeat_size = 1;
for (int i = *axes_v.rbegin() + 1; i < result_shape.size(); i++)
{
repeat_size *= result_shape[i];
}
writer << "{ // " << n->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", " << out[0].get_name()
<< ", " << repeat_size << ", " << repeat_times << ", " << out[0].get_size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
else
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
}
void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitConstant( template <>
codegen::CodeWriter& writer, void GPU_Emitter::EMITTER_DECL(ngraph::op::Constant)
const ngraph::Node* n, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, }
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
auto reshape = static_cast<const op::Reshape*>(n);
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto result_shape = out[0].get_shape();
auto& result_element_type = out[0].get_element_type();
auto input_order = reshape->get_input_order();
bool same_layout = is_sorted(input_order.begin(), input_order.end());
size_t result_shape_product = 1;
for (auto i : result_shape)
{
result_shape_product *= i;
}
// If there is no layout change or we are just going from 1^n to 1^m or a zero-size tensor,
// we can just copy.
if (same_layout || result_shape_product < 2)
{
writer << "{ // " << n->get_name() << " 1\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
<< ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
}
// If there *is* a layout change in the 2D case, we transpose the input.
else if (arg_rank == 2)
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgeam("
<< "cublas_handle,"
<< "CUBLAS_OP_T,"
<< "CUBLAS_OP_T," << arg_shape[0] << "," << arg_shape[1] << ","
<< "&alpha," // Alpha
<< args[0].get_name() << "," << arg_shape[1] << ","
<< "&beta," // beta
<< args[0].get_name() << "," << arg_shape[1] << "," << out[0].get_name() << ","
<< result_shape[1] << ");\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
// Other cases (reordering of axes for tensors with rank>2) are not handled yet.
else
{
throw runtime_error(
"Axis permutation in reshape is not implemented yet for tensors with rank>2");
}
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitFunctionCall(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
}
void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer, template <>
const ngraph::Node* n, void GPU_Emitter::EMITTER_DECL(ngraph::op::Reshape)
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) if (out[0].get_size() == 0)
{ {
throw std::runtime_error(n->get_name() + " is not implemented."); return;
} }
auto reshape = static_cast<const op::Reshape*>(node);
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto result_shape = out[0].get_shape();
auto& result_element_type = out[0].get_element_type();
auto input_order = reshape->get_input_order();
bool same_layout = is_sorted(input_order.begin(), input_order.end());
size_t result_shape_product = 1;
for (auto i : result_shape)
{
result_shape_product *= i;
}
// If there is no layout change or we are just going from 1^n to 1^m or a zero-size tensor,
// we can just copy.
if (same_layout || result_shape_product < 2)
{
writer << "{ // " << node->get_name() << " 1\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
<< args[0].get_name() << ", " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
}
// If there *is* a layout change in the 2D case, we transpose the input.
else if (arg_rank == 2)
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
writer << "cublasSgeam("
<< "cublas_handle,"
<< "CUBLAS_OP_T,"
<< "CUBLAS_OP_T," << arg_shape[0] << "," << arg_shape[1] << ","
<< "&alpha," // Alpha
<< args[0].get_name() << "," << arg_shape[1] << ","
<< "&beta," // beta
<< args[0].get_name() << "," << arg_shape[1] << "," << out[0].get_name()
<< "," << result_shape[1] << ");\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
// Other cases (reordering of axes for tensors with rank>2) are not handled yet.
else
{
throw runtime_error(
"Axis permutation in reshape is not implemented yet for tensors with "
"rank>2");
}
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitSum(codegen::CodeWriter& writer, template <>
const ngraph::Node* n, void GPU_Emitter::EMITTER_DECL(ngraph::op::FunctionCall)
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) }
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitMultiply( template <>
codegen::CodeWriter& writer, void GPU_Emitter::EMITTER_DECL(ngraph::op::Multiply)
const ngraph::Node* n, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, if (out[0].get_size() == 0)
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) {
{ return;
if (out[0].get_size() == 0) }
{ writer << "{ // " << node->get_name() << "\n";
return; writer.indent++;
} writer << "int count = " << out[0].get_size() << ";\n";
writer << "{ // " << n->get_name() << "\n"; writer += R"(
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0; float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
...@@ -675,56 +606,29 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -675,56 +606,29 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN); CUDNN_NOT_PROPAGATE_NAN);
)"; )";
writer << "cudnnOpTensor(cudnn_handle," writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc," << "opTensorDesc,"
<< "&alpha1," << "&alpha1,"
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&alpha2," << "&alpha2,"
<< "descriptor," << args[1].get_name() << "," << "descriptor," << args[1].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer, template <>
const ngraph::Node* n, void GPU_Emitter::EMITTER_DECL(ngraph::op::Sqrt)
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) if (out[0].get_size() == 0)
{ {
if (out[0].get_size() == 0) return;
{ }
return; writer << "{ // " << node->get_name() << "\n";
} writer.indent++;
writer << "{ // " << n->get_name() << "\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer.indent++; writer += R"(
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 0, beta = 0; float alpha1 = 1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
...@@ -744,71 +648,30 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -744,71 +648,30 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_NOT_PROPAGATE_NAN); CUDNN_NOT_PROPAGATE_NAN);
)"; )";
writer << "cudnnOpTensor(cudnn_handle," writer << "cudnnOpTensor(cudnn_handle,"
<< "opTensorDesc," << "opTensorDesc,"
<< "&alpha1," << "&alpha1,"
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&alpha2," << "&alpha2,"
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
void runtime::gpu::GPU_Emitter::EmitConvolution(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitReduceWindow(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitResult(codegen::CodeWriter& writer, template <>
const ngraph::Node* n, void GPU_Emitter::EMITTER_DECL(ngraph::op::Result)
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, {
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) writer << "{ //" << node->get_name() << "\n";
{ writer.indent++;
writer << "{ //" << n->get_name() << "\n"; writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
writer.indent++; << args[0].get_name() << ", " << out[0].get_size() << " * "
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name() << out[0].get_element_type().size() << ");\n";
<< ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n"; writer.indent--;
writer.indent--; writer << "}\n";
writer << "}\n"; return;
return; }
}
}
} }
...@@ -24,12 +24,12 @@ ...@@ -24,12 +24,12 @@
#include "ngraph/runtime/gpu/gpu_external_function.hpp" #include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp" #include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
#define EMITTER_DECL(E) \ #define EMITTER_DECL(op_name) \
E(codegen::CodeWriter& writer, \ emit<op_name>(GPU_ExternalFunction * external_function, \
const ngraph::Node* n, \ codegen::CodeWriter & writer, \
const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& args, \ const ngraph::Node* node, \
const std::vector<ngraph::runtime::gpu::GPU_TensorViewWrapper>& out) const std::vector<GPU_TensorViewWrapper>& args, \
const std::vector<GPU_TensorViewWrapper>& out)
namespace ngraph namespace ngraph
{ {
namespace runtime namespace runtime
...@@ -39,45 +39,30 @@ namespace ngraph ...@@ -39,45 +39,30 @@ namespace ngraph
class GPU_Emitter class GPU_Emitter
{ {
public: public:
static void EMITTER_DECL(EmitNop); template <typename OP>
static void EMITTER_DECL(EmitAdd); static void emit(GPU_ExternalFunction* external_function,
static void EMITTER_DECL(EmitDot); codegen::CodeWriter& writer,
static void EMITTER_DECL(EmitMultiply); const ngraph::Node* node,
static void EMITTER_DECL(EmitGetOutputElement); const std::vector<GPU_TensorViewWrapper>& args,
static void EMITTER_DECL(EmitXLAGetTupleElement); const std::vector<GPU_TensorViewWrapper>& out)
static void EMITTER_DECL(EmitUnaryElementwise); {
static void EMITTER_DECL(EmitTuple); throw std::runtime_error("Unimplemented op in GPU emitter for " +
static void EMITTER_DECL(EmitConcat); node->get_name());
static void EMITTER_DECL(EmitDivide); }
static void EMITTER_DECL(EmitEqual);
static void EMITTER_DECL(EmitGreater); static void nop(GPU_ExternalFunction* external_function,
static void EMITTER_DECL(EmitGreaterEq); codegen::CodeWriter& writer,
static void EMITTER_DECL(EmitLess); const ngraph::Node* node,
static void EMITTER_DECL(EmitLessEq); const std::vector<GPU_TensorViewWrapper>& args,
static void EMITTER_DECL(EmitMaximum); const std::vector<GPU_TensorViewWrapper>& out)
static void EMITTER_DECL(EmitMinimum); {
static void EMITTER_DECL(EmitNegative); }
static void EMITTER_DECL(EmitNotEqual);
static void EMITTER_DECL(EmitSelect); static void EmitUnaryElementwise(GPU_ExternalFunction* external_function,
static void EMITTER_DECL(EmitSubtract); codegen::CodeWriter& writer,
static void EMITTER_DECL(EmitBroadcast); const ngraph::Node* node,
static void EMITTER_DECL(EmitConvert); const std::vector<GPU_TensorViewWrapper>& args,
static void EMITTER_DECL(EmitConstant); const std::vector<GPU_TensorViewWrapper>& out);
static void EMITTER_DECL(EmitReshape);
static void EMITTER_DECL(EmitFunctionCall);
static void EMITTER_DECL(EmitReduce);
static void EMITTER_DECL(EmitSlice);
static void EMITTER_DECL(EmitSum);
static void EMITTER_DECL(EmitPower);
static void EMITTER_DECL(EmitReplaceSlice);
static void EMITTER_DECL(EmitOneHot);
static void EMITTER_DECL(EmitSqrt);
static void EMITTER_DECL(EmitConvolution);
static void EMITTER_DECL(EmitMaxPool);
static void EMITTER_DECL(EmitReverse);
static void EMITTER_DECL(EmitReduceWindow);
static void EMITTER_DECL(EmitSelectAndScatter);
static void EMITTER_DECL(EmitResult);
}; };
} }
} }
......
...@@ -41,8 +41,11 @@ ...@@ -41,8 +41,11 @@
#include "ngraph/ops/abs.hpp" #include "ngraph/ops/abs.hpp"
#include "ngraph/ops/acos.hpp" #include "ngraph/ops/acos.hpp"
#include "ngraph/ops/add.hpp" #include "ngraph/ops/add.hpp"
#include "ngraph/ops/allreduce.hpp"
#include "ngraph/ops/asin.hpp" #include "ngraph/ops/asin.hpp"
#include "ngraph/ops/atan.hpp" #include "ngraph/ops/atan.hpp"
#include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/broadcast.hpp" #include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/ceiling.hpp" #include "ngraph/ops/ceiling.hpp"
#include "ngraph/ops/concat.hpp" #include "ngraph/ops/concat.hpp"
...@@ -57,24 +60,34 @@ ...@@ -57,24 +60,34 @@
#include "ngraph/ops/exp.hpp" #include "ngraph/ops/exp.hpp"
#include "ngraph/ops/floor.hpp" #include "ngraph/ops/floor.hpp"
#include "ngraph/ops/function_call.hpp" #include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/greater.hpp" #include "ngraph/ops/greater.hpp"
#include "ngraph/ops/greater_eq.hpp" #include "ngraph/ops/greater_eq.hpp"
#include "ngraph/ops/less.hpp" #include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp" #include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp" #include "ngraph/ops/log.hpp"
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/max_pool.hpp" #include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp" #include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/minimum.hpp" #include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp" #include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp" #include "ngraph/ops/negative.hpp"
#include "ngraph/ops/not.hpp" #include "ngraph/ops/not.hpp"
#include "ngraph/ops/not_equal.hpp" #include "ngraph/ops/not_equal.hpp"
#include "ngraph/ops/one_hot.hpp" #include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/op.hpp"
#include "ngraph/ops/pad.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/power.hpp" #include "ngraph/ops/power.hpp"
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp" #include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp" #include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/relu.hpp"
#include "ngraph/ops/remainder.hpp"
#include "ngraph/ops/replace_slice.hpp" #include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp" #include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/result.hpp"
#include "ngraph/ops/reverse.hpp" #include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/select.hpp" #include "ngraph/ops/select.hpp"
#include "ngraph/ops/select_and_scatter.hpp" #include "ngraph/ops/select_and_scatter.hpp"
...@@ -82,6 +95,7 @@ ...@@ -82,6 +95,7 @@
#include "ngraph/ops/sin.hpp" #include "ngraph/ops/sin.hpp"
#include "ngraph/ops/sinh.hpp" #include "ngraph/ops/sinh.hpp"
#include "ngraph/ops/slice.hpp" #include "ngraph/ops/slice.hpp"
#include "ngraph/ops/softmax.hpp"
#include "ngraph/ops/sqrt.hpp" #include "ngraph/ops/sqrt.hpp"
#include "ngraph/ops/subtract.hpp" #include "ngraph/ops/subtract.hpp"
#include "ngraph/ops/sum.hpp" #include "ngraph/ops/sum.hpp"
...@@ -100,7 +114,6 @@ ...@@ -100,7 +114,6 @@
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
using namespace std; using namespace std;
using namespace ngraph;
static const string s_output_dir = "gpu_codegen"; static const string s_output_dir = "gpu_codegen";
...@@ -146,91 +159,119 @@ static StaticInitializers s_static_initializers; ...@@ -146,91 +159,119 @@ static StaticInitializers s_static_initializers;
#define TI(x) type_index(typeid(x)) #define TI(x) type_index(typeid(x))
static const runtime::gpu::OpMap dispatcher{ namespace ngraph
{TI(ngraph::op::Add), &runtime::gpu::GPU_Emitter::EmitAdd},
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::EmitDot},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::EmitMultiply},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::EmitNop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::EmitConcat},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitDivide},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitEqual},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitGreater},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitGreaterEq},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitLess},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitLessEq},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::EmitMaximum},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::EmitMinimum},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::EmitNegative},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitNotEqual},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitPower},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::EmitSelect},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitSubtract},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::EmitBroadcast},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitConvert},
{TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::EmitConstant},
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::EmitReshape},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::EmitFunctionCall},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::EmitReduce},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::EmitSlice},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::EmitSum},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::EmitReplaceSlice},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::EmitOneHot},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::EmitSqrt},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::EmitConvolution},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::EmitMaxPool},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::EmitReverse},
{TI(ngraph::op::ReduceWindow), &runtime::gpu::GPU_Emitter::EmitReduceWindow},
{TI(ngraph::op::SelectAndScatter), &runtime::gpu::GPU_Emitter::EmitSelectAndScatter},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::EmitResult},
};
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function)
: ngraph::runtime::ExternalFunction(function, release_function)
, m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
{ {
} namespace runtime
void runtime::gpu::GPU_ExternalFunction::compile()
{
if (m_is_compiled)
{ {
return; namespace gpu
} {
static const OpMap dispatcher{
{TI(ngraph::op::Add), &GPU_Emitter::emit<ngraph::op::Add>},
{TI(ngraph::op::Dot), &GPU_Emitter::emit<ngraph::op::Dot>},
{TI(ngraph::op::Multiply), &GPU_Emitter::emit<ngraph::op::Multiply>},
{TI(ngraph::op::Parameter), &GPU_Emitter::nop},
{TI(ngraph::op::Abs), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Concat), &GPU_Emitter::emit<ngraph::op::Concat>},
{TI(ngraph::op::Divide), &GPU_Emitter::emit<ngraph::op::Divide>},
{TI(ngraph::op::Equal), &GPU_Emitter::emit<ngraph::op::Equal>},
{TI(ngraph::op::GetOutputElement),
&GPU_Emitter::emit<ngraph::op::GetOutputElement>},
{TI(ngraph::op::Greater), &GPU_Emitter::emit<ngraph::op::Greater>},
{TI(ngraph::op::GreaterEq), &GPU_Emitter::emit<ngraph::op::GreaterEq>},
{TI(ngraph::op::Less), &GPU_Emitter::emit<ngraph::op::Less>},
{TI(ngraph::op::LessEq), &GPU_Emitter::emit<ngraph::op::LessEq>},
{TI(ngraph::op::Log), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Maximum), &GPU_Emitter::emit<ngraph::op::Maximum>},
{TI(ngraph::op::Minimum), &GPU_Emitter::emit<ngraph::op::Minimum>},
{TI(ngraph::op::Negative), &GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &GPU_Emitter::emit<ngraph::op::NotEqual>},
{TI(ngraph::op::Power), &GPU_Emitter::emit<ngraph::op::Power>},
{TI(ngraph::op::Select), &GPU_Emitter::emit<ngraph::op::Select>},
{TI(ngraph::op::Subtract), &GPU_Emitter::emit<ngraph::op::Subtract>},
{TI(ngraph::op::Broadcast), &GPU_Emitter::emit<ngraph::op::Broadcast>},
{TI(ngraph::op::Convert), &GPU_Emitter::emit<ngraph::op::Convert>},
{TI(ngraph::op::Constant), &GPU_Emitter::emit<ngraph::op::Constant>},
{TI(ngraph::op::Reshape), &GPU_Emitter::emit<ngraph::op::Reshape>},
{TI(ngraph::op::FunctionCall), &GPU_Emitter::emit<ngraph::op::FunctionCall>},
{TI(ngraph::op::Reduce), &GPU_Emitter::emit<ngraph::op::Reduce>},
{TI(ngraph::op::Sign), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Slice), &GPU_Emitter::emit<ngraph::op::Slice>},
{TI(ngraph::op::Sum), &GPU_Emitter::emit<ngraph::op::Sum>},
{TI(ngraph::op::Exp), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sin), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sinh), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cos), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cosh), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tan), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tanh), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Asin), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Acos), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Atan), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::ReplaceSlice), &GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
{TI(ngraph::op::OneHot), &GPU_Emitter::emit<ngraph::op::OneHot>},
{TI(ngraph::op::Floor), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Ceiling), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sqrt), &GPU_Emitter::emit<ngraph::op::Sqrt>},
{TI(ngraph::op::Convolution), &GPU_Emitter::emit<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropFilters),
&GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBackpropData),
&GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::Not), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::MaxPool), &GPU_Emitter::emit<ngraph::op::MaxPool>},
{TI(ngraph::op::Reverse), &GPU_Emitter::emit<ngraph::op::Reverse>},
{TI(ngraph::op::Result), &GPU_Emitter::emit<ngraph::op::Result>},
{TI(ngraph::op::ReduceWindow), &GPU_Emitter::emit<ngraph::op::ReduceWindow>},
{TI(ngraph::op::SelectAndScatter),
&GPU_Emitter::emit<ngraph::op::SelectAndScatter>},
{TI(ngraph::op::AvgPool), &GPU_Emitter::emit<ngraph::op::AvgPool>},
{TI(ngraph::op::AvgPoolBackprop), &GPU_Emitter::emit<ngraph::op::AvgPoolBackprop>},
{TI(ngraph::op::Pad), &GPU_Emitter::emit<ngraph::op::Pad>},
{TI(ngraph::op::BatchNorm), &GPU_Emitter::emit<ngraph::op::BatchNorm>},
{TI(ngraph::op::BatchNormBackprop),
&GPU_Emitter::emit<ngraph::op::BatchNormBackprop>},
{TI(ngraph::op::MaxPoolBackprop), &GPU_Emitter::emit<ngraph::op::MaxPoolBackprop>},
{TI(ngraph::op::Product), &GPU_Emitter::emit<ngraph::op::Product>},
{TI(ngraph::op::Max), &GPU_Emitter::emit<ngraph::op::Max>},
{TI(ngraph::op::Min), &GPU_Emitter::emit<ngraph::op::Min>},
{TI(ngraph::op::Relu), &GPU_Emitter::emit<ngraph::op::Relu>},
{TI(ngraph::op::ReluBackprop), &GPU_Emitter::emit<ngraph::op::ReluBackprop>},
{TI(ngraph::op::Softmax), &GPU_Emitter::emit<ngraph::op::Softmax>},
};
GPU_ExternalFunction::GPU_ExternalFunction(const shared_ptr<ngraph::Function>& function,
bool release_function)
: ngraph::runtime::ExternalFunction(function, release_function)
, m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
{
}
void GPU_ExternalFunction::compile()
{
if (m_is_compiled)
{
return;
}
string function_name = m_function->get_name(); string function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt"); string dump_filename =
file_util::path_join(s_output_dir, function_name + "_ops.txt");
pass::Manager pass_manager; pass::Manager pass_manager;
// pass_manager.register_pass<pass::TopologicalSort>(); // pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major. // For now, just make everyone row-major.
pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>(); pass_manager
pass_manager.register_pass<pass::Liveness>(); .register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
pass_manager.register_pass<pass::MemoryLayout>(64); pass_manager.register_pass<pass::Liveness>();
pass_manager.register_pass<pass::DumpSorted>(dump_filename); pass_manager.register_pass<pass::MemoryLayout>(64);
pass_manager.run_passes(m_function); pass_manager.register_pass<pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function);
codegen::CodeWriter writer; codegen::CodeWriter writer;
writer += writer +=
R"(// Generated by the NGraph GPU backend R"(// Generated by the NGraph GPU backend
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
...@@ -256,529 +297,560 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -256,529 +297,560 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
)"; )";
string pch_header_source = writer.get_code(); string pch_header_source = writer.get_code();
writer += R"( writer += R"(
using namespace ngraph; using namespace ngraph;
using namespace std; using namespace std;
)"; )";
if (m_emit_timing) if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{ {
names.push_back(node->get_name()); writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function :
pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return "
<< names.size() << "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer
<< "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer
<< "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
// // atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n";
writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function :
pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv =
node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static "
<< tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "_cpu["
<< c_value_strings.size() << "] =\n";
writer << "{\n";
writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
writer << "static "
<< tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
m_variable_name_map[tv->get_tensor().get_name()] =
tv->get_tensor().get_name();
}
}
} }
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
// // atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n";
writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
<< "] =\n";
writer << "{\n";
writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
}
}
}
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
"cublasHandle_t& cublas_handle, "
"cudnnHandle_t& cudnn_handle);\n";
}
writer << "\n"; writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name()
<< "(void** inputs, void** outputs, "
"cublasHandle_t& cublas_handle, "
"cudnnHandle_t& cudnn_handle);\n";
}
unordered_map<Node*, string> match_functions; writer << "\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
set<string> output_names; unordered_map<Node*, string> match_functions;
for (shared_ptr<Node> op : current_function->get_results()) for (shared_ptr<Function> current_function :
{ pass_manager.get_state().get_functions())
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
if (tmp.size() < 2)
{
// Since we are comparing ops there must be at least two ops to proceed.
continue;
}
vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
for (size_t i = 0; i < op_list.size() - 1; i++)
{
if (op_list[i]->is_constant() || op_list[i]->is_parameter())
{
continue;
}
if (contains_key(match_functions, op_list[i].get()))
{
continue;
}
string match_function_name;
for (size_t j = i + 1; j < op_list.size(); j++)
{
if (0) //op_list[i]->is_functionally_identical(*op_list[j]))
{ {
if (match_function_name.empty()) bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{ {
match_function_name = "func_" + op_list[i]->get_name(); shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
match_functions.insert({op_list[i].get(), match_function_name}); output_names.insert(tv->get_tensor().get_name());
} }
match_functions.insert({op_list[j].get(), match_function_name}); const list<shared_ptr<Node>>& tmp = current_function->get_ordered_ops();
} if (tmp.size() < 2)
} {
if (!match_function_name.empty()) // Since we are comparing ops there must be at least two ops to proceed.
{ continue;
writer << "static void " << match_function_name << "("; }
writer.indent++; vector<shared_ptr<Node>> op_list{tmp.begin(), tmp.end()};
// Work around a compiler warning (*node inside typeid may have effects for (size_t i = 0; i < op_list.size() - 1; i++)
// with shared pointers, which is fine here but clang doesn't like it.)
auto& n = *op_list[i];
auto handler = dispatcher.find(type_index(typeid(n)));
vector<GPU_TensorViewWrapper> in;
size_t arg_index = 0;
set<string> arg_names;
for (const descriptor::Input& input : n.get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
if (!contains(arg_names, tvw.get_name()))
{ {
arg_names.insert(tvw.get_name()); if (op_list[i]->is_constant() || op_list[i]->is_parameter())
if (arg_index++ > 0) {
continue;
}
if (contains_key(match_functions, op_list[i].get()))
{
continue;
}
string match_function_name;
for (size_t j = i + 1; j < op_list.size(); j++)
{ {
writer << ","; if (0) //op_list[i]->is_functionally_identical(*op_list[j]))
{
if (match_function_name.empty())
{
match_function_name = "func_" + op_list[i]->get_name();
match_functions.insert({op_list[i].get(), match_function_name});
}
match_functions.insert({op_list[j].get(), match_function_name});
}
}
if (!match_function_name.empty())
{
writer << "static void " << match_function_name << "(";
writer.indent++;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto& n = *op_list[i];
auto handler = dispatcher.find(type_index(typeid(n)));
vector<GPU_TensorViewWrapper> in;
size_t arg_index = 0;
set<string> arg_names;
for (const descriptor::Input& input : n.get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_arg" + to_string(arg_index)};
if (!contains(arg_names, tvw.get_name()))
{
arg_names.insert(tvw.get_name());
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
}
in.push_back(tvw);
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : n.get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)};
if (arg_index++ > 0)
{
writer << ",";
}
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
out.push_back(tvw);
}
writer.indent--;
writer << "\n)\n";
writer << "{\n";
writer.indent++;
handler->second(this, writer, &n, in, out);
writer.indent--;
writer << "}\n";
} }
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
} }
in.push_back(tvw);
} }
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : n.get_outputs()) for (shared_ptr<Function> current_function :
pass_manager.get_state().get_functions())
{ {
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view(); set<string> output_names;
GPU_TensorViewWrapper tvw{tv, "_out" + to_string(arg_index)}; for (shared_ptr<Node> op : current_function->get_results())
if (arg_index++ > 0)
{ {
writer << ","; shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
set<descriptor::TensorView*> constants;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (dynamic_cast<ngraph::op::Constant*>(node.get()))
{
shared_ptr<descriptor::TensorView> tv =
node->get_outputs()[0].get_tensor_view();
constants.insert(tv.get());
}
} }
writer << "\n";
writer << tvw.get_type() << "* " << tvw.get_name();
out.push_back(tvw);
}
writer.indent--;
writer << "\n)\n";
writer << "{\n";
writer.indent++;
handler->second(writer, &n, in, out);
writer.indent--;
writer << "}\n";
}
}
}
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
set<descriptor::TensorView*> constants;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (dynamic_cast<op::Constant*>(node.get()))
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
constants.insert(tv.get());
}
}
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle, cudnnHandle_t& "
"cudnn_handle)\n";
writer << "{\n";
writer.indent++;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions()) writer << "extern \"C\" void " << current_function->get_name();
{ writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle, "
for (shared_ptr<Node> node : current_function->get_ordered_ops()) "cudnnHandle_t& "
{ "cudnn_handle)\n";
const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv =
node->get_outputs()[0].get_tensor_view();
writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
}
}
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (node->liveness_new_list.size() > 0)
{
temporaries_used = true;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
worst_case_tmp_size += tensor->size();
}
}
}
if (temporaries_used)
{
size_t temp_pool_size = current_function->get_temporary_pool_size();
writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
writer << "void* pool_base_ptr = runtime::gpu::create_gpu_buffer(" << temp_pool_size
<< ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
stringstream ss;
ss << "((" << tensor->get_element_type().c_type_string()
<< "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))";
m_variable_name_map[tensor->get_name()] = ss.str();
}
}
}
// Add inputs to the variable name map for (shared_ptr<Function> current_function :
size_t arg_index = 0; pass_manager.get_state().get_functions())
for (shared_ptr<op::Parameter> param : current_function->get_parameters()) {
{ for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (size_t i = 0; i < param->get_output_size(); ++i) {
{ const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i); if (c)
const element::Type& et = tv->get_tensor_view_type()->get_element_type(); {
string type = et.c_type_string(); shared_ptr<descriptor::TensorView> tv =
stringstream ss; node->get_outputs()[0].get_tensor_view();
ss << "((" << type << "*)(inputs[" << arg_index << "]))"; writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str(); writer << "{\n";
arg_index++; writer.indent++;
} writer << "runtime::gpu::cuda_memcpyHtD("
} << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().get_name() << "_cpu, "
<< tv->get_tensor().size() << ");\n";
writer.indent--;
writer << "}\n";
}
}
}
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (node->liveness_new_list.size() > 0)
{
temporaries_used = true;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
worst_case_tmp_size += tensor->size();
}
}
}
if (temporaries_used)
{
size_t temp_pool_size = current_function->get_temporary_pool_size();
writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
<< temp_pool_size << ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
stringstream ss;
ss << "((" << tensor->get_element_type().c_type_string()
<< "*)((char *)pool_base_ptr + " << tensor->get_pool_offset()
<< "))";
m_variable_name_map[tensor->get_name()] = ss.str();
}
}
}
// create output alias map // Add inputs to the variable name map
size_t output_index = 0; size_t arg_index = 0;
unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map; for (shared_ptr<ngraph::op::Parameter> param :
vector<size_t> aliases; current_function->get_parameters())
for (size_t i = 0; i < current_function->get_output_size(); ++i) {
{ for (size_t i = 0; i < param->get_output_size(); ++i)
shared_ptr<Node> op = current_function->get_output_op(i); {
shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view(); shared_ptr<descriptor::TensorView> tv =
vector<size_t>& al = output_alias_map[otv.get()]; param->get_output_tensor_view(i);
al.push_back(output_index); const element::Type& et =
if (al.size() > 1) tv->get_tensor_view_type()->get_element_type();
{ string type = et.c_type_string();
aliases.push_back(output_index); stringstream ss;
} ss << "((" << type << "*)(inputs[" << arg_index << "]))";
output_index++; m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
} arg_index++;
}
}
// Add outputs to the variable name map // create output alias map
output_index = 0; size_t output_index = 0;
for (size_t i = 0; i < current_function->get_output_size(); ++i) unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
{ vector<size_t> aliases;
shared_ptr<Node> op = current_function->get_output_op(i); for (size_t i = 0; i < current_function->get_output_size(); ++i)
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
const element::Type& et = tv->get_tensor_view_type()->get_element_type();
bool parameter_as_output = false;
for (shared_ptr<op::Parameter> param : current_function->get_parameters())
{
for (const descriptor::Output& pout : param->get_outputs())
{
shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
if (tv == ptv)
{ {
parameter_as_output = true; shared_ptr<Node> op = current_function->get_output_op(i);
writer << "runtime::gpu::cuda_memcpyDtD(reinterpret_cast<" shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
<< et.c_type_string() << "*>(outputs[" << output_index << "]), " vector<size_t>& al = output_alias_map[otv.get()];
<< m_variable_name_map[ptv->get_tensor().get_name()] << ", " al.push_back(output_index);
<< ptv->get_tensor().size() << ");\n"; if (al.size() > 1)
break; {
aliases.push_back(output_index);
}
output_index++;
}
// Add outputs to the variable name map
output_index = 0;
for (size_t i = 0; i < current_function->get_output_size(); ++i)
{
shared_ptr<Node> op = current_function->get_output_op(i);
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
const element::Type& et = tv->get_tensor_view_type()->get_element_type();
bool parameter_as_output = false;
for (shared_ptr<ngraph::op::Parameter> param :
current_function->get_parameters())
{
for (const descriptor::Output& pout : param->get_outputs())
{
shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
if (tv == ptv)
{
parameter_as_output = true;
writer
<< "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
<< et.c_type_string() << "*>(outputs[" << output_index
<< "]), "
<< m_variable_name_map[ptv->get_tensor().get_name()] << ", "
<< ptv->get_tensor().size() << ");\n";
break;
}
}
}
if (!parameter_as_output && !contains(aliases, output_index))
{
if (contains(constants, tv.get()))
{
writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs["
<< output_index << "], " << tv->get_tensor().get_name()
<< ", " << tv->get_tensor().size() << ");\n";
}
else
{
string type = et.c_type_string();
stringstream ss;
ss << "((" << type << "*)(outputs[" << output_index << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
}
}
output_index++;
}
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
auto& n =
*node; // Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = dispatcher.find(type_index(typeid(n)));
if (handler == dispatcher.end())
{
throw ngraph_error("Unhandled op during code generation : " +
node->description());
}
vector<GPU_TensorViewWrapper> in;
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(GPU_TensorViewWrapper(
tv, m_variable_name_map[tv->get_tensor().get_name()]));
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(GPU_TensorViewWrapper(
tv, m_variable_name_map[tv->get_tensor().get_name()]));
}
// Emit operation prologue
if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{
emit_debug_function_entry(writer, node.get(), in, out);
}
}
// Emit operation body
string func_name;
auto it = match_functions.find(node.get());
if (it != match_functions.end())
{
func_name = it->second;
}
if (func_name.empty())
{
handler->second(this, writer, node.get(), in, out);
}
else
{
vector<string> names;
for (const GPU_TensorViewWrapper& tv : in)
{
names.push_back(tv.get_name());
}
for (const GPU_TensorViewWrapper& tv : out)
{
names.push_back(tv.get_name());
}
writer << func_name << "(" << join(names) << ");\n";
}
// Emit operation epilogue
if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{
emit_debug_function_exit(writer, node.get(), in, out);
}
}
} }
writer.indent--;
// End generated function
writer += "}\n\n";
} }
} // TODO: Cleanup and make this a utility function
if (!parameter_as_output && !contains(aliases, output_index))
{ file_util::make_directory(s_output_dir);
if (contains(constants, tv.get())) string filename =
file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
ofstream out(filename);
string code = writer.get_code();
out << code;
out.close();
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(pch_header_source);
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
{ {
writer << "runtime::gpu::cuda_memcpyHtD(outputs[" << output_index << "], " throw runtime_error("function failed to compile");
<< tv->get_tensor().get_name() << ", " << tv->get_tensor().size()
<< ");\n";
} }
else m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize();
m_compiled_function =
m_execution_engine->find_function<EntryPoint_t>(function_name);
assert(m_compiled_function);
m_is_compiled = true;
if (m_release_function)
{ {
string type = et.c_type_string(); release_function();
stringstream ss;
ss << "((" << type << "*)(outputs[" << output_index << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
} }
} }
output_index++;
}
for (shared_ptr<Node> node : current_function->get_ordered_ops()) void GPU_ExternalFunction::handle_output_alias(
{ codegen::CodeWriter& writer,
auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects const Node& node,
// with shared pointers, which is fine here but clang doesn't like it.) const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
auto handler = dispatcher.find(type_index(typeid(n)));
if (handler == dispatcher.end())
{
throw ngraph_error("Unhandled op during code generation : " + node->description());
}
vector<GPU_TensorViewWrapper> in;
for (const descriptor::Input& input : node->get_inputs())
{ {
const descriptor::Output& output = input.get_output(); for (const descriptor::Output& output : node.get_outputs())
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
}
// Emit operation prologue
if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{ {
emit_debug_function_entry(writer, node.get(), in, out); shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
auto it = output_alias_map.find(otv.get());
if (it != output_alias_map.end())
{
const vector<size_t>& outputs = it->second;
if (outputs.size() > 1)
{
writer << "{ // handle output alias for previous op\n";
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
"outputs["
<< outputs[i] << "]), static_cast<void*>(outputs["
<< outputs[0] << "]), " << otv->get_tensor().size()
<< ");\n";
}
writer.indent--;
writer << "}\n";
}
}
} }
} }
// Emit operation body shared_ptr<ngraph::runtime::CallFrame> GPU_ExternalFunction::make_call_frame()
string func_name;
auto it = match_functions.find(node.get());
if (it != match_functions.end())
{
func_name = it->second;
}
if (func_name.empty())
{ {
handler->second(writer, node.get(), in, out); if (!m_is_compiled)
}
else
{
vector<string> names;
for (const GPU_TensorViewWrapper& tv : in)
{
names.push_back(tv.get_name());
}
for (const GPU_TensorViewWrapper& tv : out)
{ {
names.push_back(tv.get_name()); compile();
} }
writer << func_name << "(" << join(names) << ");\n";
return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
} }
// Emit operation epilogue void GPU_ExternalFunction::emit_debug_function_entry(
if (!node->is_parameter() && !node->is_constant()) codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{ {
if (m_emit_timing) writer << "timer_" << node->get_name() << ".start();\n";
{
emit_debug_function_exit(writer, node.get(), in, out);
}
} }
}
writer.indent--;
// End generated function
writer += "}\n\n";
}
// TODO: Cleanup and make this a utility function
file_util::make_directory(s_output_dir);
string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
ofstream out(filename);
string code = writer.get_code();
out << code;
out.close();
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(pch_header_source);
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
{
throw runtime_error("function failed to compile");
}
m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
assert(m_compiled_function);
m_is_compiled = true;
if (m_release_function)
{
release_function();
}
}
void runtime::gpu::GPU_ExternalFunction::handle_output_alias( void GPU_ExternalFunction::emit_debug_function_exit(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const Node& node, Node* node,
const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map) const std::vector<GPU_TensorViewWrapper>& in,
{ const std::vector<GPU_TensorViewWrapper>& out)
for (const descriptor::Output& output : node.get_outputs())
{
shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
auto it = output_alias_map.find(otv.get());
if (it != output_alias_map.end())
{
const vector<size_t>& outputs = it->second;
if (outputs.size() > 1)
{ {
writer << "{ // handle output alias for previous op\n"; writer << "timer_" << node->get_name() << ".stop();\n";
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "runtime::gpu::cuda_memcpyDtD(static_cast<void*>(outputs["
<< outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
<< "]), " << otv->get_tensor().size() << ");\n";
}
writer.indent--;
writer << "}\n";
} }
} }
} }
} }
\ No newline at end of file
shared_ptr<ngraph::runtime::CallFrame> runtime::gpu::GPU_ExternalFunction::make_call_frame()
{
if (!m_is_compiled)
{
compile();
}
return make_shared<ngraph::runtime::gpu::GPU_CallFrame>(shared_from_this(),
m_compiled_function);
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{
writer << "timer_" << node->get_name() << ".start();\n";
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{
writer << "timer_" << node->get_name() << ".stop();\n";
}
...@@ -41,7 +41,8 @@ namespace ngraph ...@@ -41,7 +41,8 @@ namespace ngraph
class GPU_CallFrame; class GPU_CallFrame;
using OpFunction = using OpFunction =
std::function<void(codegen::CodeWriter&, std::function<void(GPU_ExternalFunction* external_function,
codegen::CodeWriter&,
const ngraph::Node*, const ngraph::Node*,
const std::vector<GPU_TensorViewWrapper>& inputs, const std::vector<GPU_TensorViewWrapper>& inputs,
const std::vector<GPU_TensorViewWrapper>& outputs)>; const std::vector<GPU_TensorViewWrapper>& outputs)>;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment