Unverified commit da50410b authored by Tristan Webb, committed by GitHub

GPU kernels for reshape, GEMM, EW ADD/Mult, Maximum (#440)

* GPU kernels for reshape, GEMM, EW ADD/Mult, Maximum

(A + B) * C test now with cuBLAS
Additional gemm and gemv calls
CMake updates for cuDNN
memcpy wrappers in gpu_util

Additional passing tests:
aliased outputs, parameter and constant tensor memcpy
parent 27fee946
......@@ -192,7 +192,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
# GPU backend currently requires CPU because they share compiler.cpp,
# and compiler.cpp requires MKLDNN
if(NGRAPH_GPU_ENABLE)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
# Add sources for the GPU backend
# and all its dependencies
......@@ -275,7 +275,7 @@ endif()
# Nvidia
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
endif()
# Argon
......
......@@ -50,6 +50,7 @@ void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
......@@ -57,6 +58,7 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
......@@ -64,6 +66,49 @@ void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
if ((arg0_shape.size() <= 2) && (arg1_shape.size() <= 2))
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasScopy("
<< "cublas_handle,"
<< out[0].get_size() << ","
<< args[0].get_name() << ","
<< "1,"
<< out[0].get_name() << ","
<< "1);\n";
writer << "cublasSaxpy("
<< "cublas_handle,"
<< out[0].get_size() << ","
<< "&alpha,"
<< args[1].get_name() << ","
<< "1,"
<< out[0].get_name() << ","
<< "1);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
throw ngraph_error("Argument shape not supported");
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
throw ngraph_error("Argument shape not supported");
}
else
{
// General ND Call?
throw ngraph_error("Argument shape not supported");
}
}
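For reference, the emitted code for this rank <= 2 case reduces to the following host-side pattern; a minimal sketch with illustrative names, assuming float tensors of equal size already resident on the device and a live cuBLAS handle:

#include <cublas_v2.h>

// out = in0 + in1 over n device-resident floats: copy in0 into out, then
// accumulate in1 with axpy (alpha = 1).
void gpu_add(cublasHandle_t handle, const float* in0, const float* in1, float* out, int n)
{
    static const float alpha = 1.0f;
    // alpha is a host pointer, so switch the handle to host pointer mode.
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    cublasScopy(handle, n, in0, 1, out, 1);          // out = in0
    cublasSaxpy(handle, n, &alpha, in1, 1, out, 1);  // out = alpha * in1 + out
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}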
void runtime::gpu::GPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
......@@ -84,33 +129,96 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
{
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
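// Note: cublasSdot writes its result through the final pointer argument, which
// here is the output tensor on the device; this relies on the handle being in
// CUBLAS_POINTER_MODE_DEVICE, the mode the other emitters restore after use.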
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
// clang-format off
writer << "cublasSdot("
<< "cublas_handle,"
<< second.get_size() << ","
<< first.get_name() << ","
<< "1,"
<< second.get_name() << ","
<< "1,"
<< out[0].get_name() << ");\n";
// clang-format on
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1))
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
// clang-format off
writer << "cublasSdot("
<< "cublas_handle,"
<< arg0_shape[0] << ","
<< args[0].get_name() << ","
<< "1,"
<< args[1].get_name() << ","
<< "1,"
<< out[0].get_name() << ");\n";
// clang-format on
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1))
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 1.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasSgemv("
<< "cublas_handle,"
<< "CUBLAS_OP_T,"
<< arg0_shape[1] << ","
<< arg0_shape[0] << ","
<< "&alpha,"
<< args[0].get_name() << ","
<< arg0_shape[1] << ","
<< args[1].get_name() << ","
<< "1,"
<< "&beta,"
<< out[0].get_name() << ","
<< "1);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2))
{
// GEMM Call
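// Row-major trick: cuBLAS is column-major, so rather than transposing we
// compute out^T = arg1^T * arg0^T; in column-major storage that is exactly
// the call below, with the operands swapped and no explicit transposes.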
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 0.0;\n";
writer << "int m = " << arg0_shape[0] << ";\n";
writer << "int n = " << arg1_shape[1] << ";\n";
writer << "int k = " << arg0_shape[0] << ";\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasSgemm("
<< "cublas_handle,"
<< "CUBLAS_OP_N,"
<< "CUBLAS_OP_N,"
<< "n,"
<< "m,"
<< "k,"
<< "&alpha,"
<< args[1].get_name() << ","
<< "n,"
<< args[0].get_name() << ","
<< "k,"
<< "&beta,"
<< out[0].get_name() << ","
<< "n);\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
else
{
......@@ -123,6 +231,7 @@ void runtime::gpu::GPU_Emitter::EmitDivide(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
......@@ -130,6 +239,7 @@ void runtime::gpu::GPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
......@@ -137,6 +247,7 @@ void runtime::gpu::GPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitGreaterEq(
......@@ -145,6 +256,7 @@ void runtime::gpu::GPU_Emitter::EmitGreaterEq(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitLess(codegen::CodeWriter& writer,
......@@ -152,6 +264,7 @@ void runtime::gpu::GPU_Emitter::EmitLess(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
......@@ -159,6 +272,7 @@ void runtime::gpu::GPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitLog(codegen::CodeWriter& writer,
......@@ -166,6 +280,7 @@ void runtime::gpu::GPU_Emitter::EmitLog(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
......@@ -173,6 +288,48 @@ void runtime::gpu::GPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const int count = " << out[0].get_size() << ";\n";
writer << "static const float alpha1 = 1.0, alpha2 = 1.0, beta = 0;\n";
// TODO Move cudnn creation to backend initialization
writer += R"(
cudnnHandle_t cudnnHandle;
(cudnnCreate(&cudnnHandle));
cudnnTensorDescriptor_t descriptor;
(cudnnCreateTensorDescriptor(&descriptor));
(cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/1,
/*image_height=*/1,
/*image_width=*/count));
cudnnOpTensorDescriptor_t opTensorDesc;
(cudnnCreateOpTensorDescriptor(&opTensorDesc));
(cudnnSetOpTensorDescriptor(opTensorDesc,
CUDNN_OP_TENSOR_MAX,
CUDNN_DATA_FLOAT,
CUDNN_NOT_PROPAGATE_NAN));
)";
// clang-format off
writer << "cudnnOpTensor(cudnnHandle,"
<< "opTensorDesc,"
<< "&alpha1,"
<< "descriptor," << args[0].get_name() << ","
<< "&alpha2,"
<< "descriptor," << args[1].get_name() << ","
<< "&beta,"
<< "descriptor," << out[0].get_name() << ");\n";
// clang-format on
writer << "(cudnnDestroy(cudnnHandle));\n";
writer.indent--;
writer << "}\n";
}
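A minimal standalone sketch of the cudnnOpTensor pattern generated above (illustrative names; assumes device-resident float buffers, and does the handle and descriptor setup inline that the TODO suggests moving into backend initialization):

#include <cudnn.h>

// out[i] = max(in0[i], in1[i]) over n device-resident floats.
void gpu_maximum(const float* in0, const float* in1, float* out, int n)
{
    const float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f;
    cudnnHandle_t cudnn;
    cudnnCreate(&cudnn);

    // Describe the flat buffer as a 1 x 1 x 1 x n NHWC tensor.
    cudnnTensorDescriptor_t desc;
    cudnnCreateTensorDescriptor(&desc);
    cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 1, 1, n);

    cudnnOpTensorDescriptor_t op_desc;
    cudnnCreateOpTensorDescriptor(&op_desc);
    cudnnSetOpTensorDescriptor(op_desc, CUDNN_OP_TENSOR_MAX, CUDNN_DATA_FLOAT,
                               CUDNN_NOT_PROPAGATE_NAN);

    cudnnOpTensor(cudnn, op_desc, &alpha1, desc, in0, &alpha2, desc, in1,
                  &beta, desc, out);

    cudnnDestroyOpTensorDescriptor(op_desc);
    cudnnDestroyTensorDescriptor(desc);
    cudnnDestroy(cudnn);
}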
void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
......@@ -180,6 +337,7 @@ void runtime::gpu::GPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitNegative(
......@@ -188,6 +346,7 @@ void runtime::gpu::GPU_Emitter::EmitNegative(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitNotEqual(
......@@ -196,12 +355,14 @@ void runtime::gpu::GPU_Emitter::EmitNotEqual(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSelect(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSubtract(
......@@ -210,6 +371,7 @@ void runtime::gpu::GPU_Emitter::EmitSubtract(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitBroadcast(
......@@ -218,6 +380,7 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
......@@ -225,6 +388,7 @@ void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitConstant(
......@@ -233,6 +397,7 @@ void runtime::gpu::GPU_Emitter::EmitConstant(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
......@@ -240,6 +405,72 @@ void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
auto reshape = static_cast<const op::Reshape*>(n);
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto result_shape = out[0].get_shape();
auto& result_element_type = out[0].get_element_type();
auto input_order = reshape->get_input_order();
bool same_layout = is_sorted(input_order.begin(), input_order.end());
size_t result_shape_product = 1;
for (auto i : result_shape)
{
result_shape_product *= i;
}
// If there is no layout change or we are just going from 1^n to 1^m or a zero-size tensor,
// we can just copy.
if (same_layout || result_shape_product < 2)
{
writer << "{ // " << n->get_name() << " 1\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
<< ", " << out[0].get_size() << "," << out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
}
// If there *is* a layout change in the 2D case, we transpose the input.
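// cublasSgeam computes C = alpha*op(A) + beta*op(B); with beta = 0 it serves
// as an out-of-place transpose: the row-major input is read as a column-major
// matrix and written back, transposed, into the output buffer.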
else if (arg_rank == 2)
{
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 0.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
// clang-format off
writer << "cublasSgeam("
<< "cublas_handle,"
<< "CUBLAS_OP_T,"
<< "CUBLAS_OP_T,"
<< arg_shape[0] << ","
<< arg_shape[1] << ","
<< "&alpha,"
<< args[0].get_name() << ","
<< arg_shape[1] << ","
<< "&beta,"
<< args[0].get_name() << ","
<< arg_shape[1] << ","
<< out[0].get_name() << ","
<< out[0].get_shape()[1] << ");\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
// Other cases (reordering of axes for tensors with rank>2) are not handled yet.
else
{
throw ngraph_error(
"Axis permutation in reshape is not implemented yet for tensors with rank>2");
}
writer.indent--;
writer << "}\n";
}
void runtime::gpu::GPU_Emitter::EmitFunctionCall(
......@@ -248,6 +479,7 @@ void runtime::gpu::GPU_Emitter::EmitFunctionCall(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
......@@ -255,6 +487,7 @@ void runtime::gpu::GPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSign(codegen::CodeWriter& writer,
......@@ -262,6 +495,7 @@ void runtime::gpu::GPU_Emitter::EmitSign(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
......@@ -269,6 +503,7 @@ void runtime::gpu::GPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSum(codegen::CodeWriter& writer,
......@@ -276,6 +511,7 @@ void runtime::gpu::GPU_Emitter::EmitSum(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitMultiply(
......@@ -284,6 +520,36 @@ void runtime::gpu::GPU_Emitter::EmitMultiply(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
const Shape& arg0_shape = args[0].get_shape();
const Shape& arg1_shape = args[1].get_shape();
// Until we have EW kernel gen, use cuBLAS
// From https://stackoverflow.com/questions/7621520/element-wise-vector-vector-multiplication-in-blas/7634831
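// Treating args[0] as the diagonal (bandwidth k = 0) of a symmetric banded
// matrix, Ssbmv computes out = diag(args[0]) * args[1], i.e. the element-wise
// product of the two tensors.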
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "static const float alpha = 1.0;\n";
writer << "static const float beta = 0.0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";;
// clang-format off
writer << "cublasSsbmv("
<< "cublas_handle,"
<< "CUBLAS_FILL_MODE_LOWER," // Corresponds to FORTRAN "L"
<< out[0].get_size() << "," // N = input size
<< "0," // k = super-diagonal i.e. just use the diagonal of A
<< "&alpha," // alpha
<< args[0].get_name() << "," // vec A (broadcast to a matrix)
<< "1," // LDA = 1
<< args[1].get_name() << "," // vector x
<< "1," // Stride x
<< "&beta," // beta
<< out[0].get_name() << "," // y
<< "1" // Stride y
<< ");\n";
// clang-format on
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
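The same trick outside the emitter, as a minimal sketch (illustrative names; assumes device-resident float buffers and a live cuBLAS handle):

#include <cublas_v2.h>

// out[i] = in0[i] * in1[i]: in0 acts as the diagonal of a zero-bandwidth
// symmetric banded matrix, so Ssbmv multiplies it element-wise against in1.
void gpu_multiply(cublasHandle_t handle, const float* in0, const float* in1, float* out, int n)
{
    static const float alpha = 1.0f;
    static const float beta = 0.0f;
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    cublasSsbmv(handle, CUBLAS_FILL_MODE_LOWER, n, /*k=*/0, &alpha,
                in0, /*lda=*/1, in1, 1, &beta, out, 1);
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}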
void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
......@@ -291,6 +557,7 @@ void runtime::gpu::GPU_Emitter::EmitExp(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSin(codegen::CodeWriter& writer,
......@@ -298,6 +565,7 @@ void runtime::gpu::GPU_Emitter::EmitSin(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
......@@ -305,6 +573,7 @@ void runtime::gpu::GPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitCos(codegen::CodeWriter& writer,
......@@ -312,6 +581,7 @@ void runtime::gpu::GPU_Emitter::EmitCos(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
......@@ -319,6 +589,7 @@ void runtime::gpu::GPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitTan(codegen::CodeWriter& writer,
......@@ -326,6 +597,7 @@ void runtime::gpu::GPU_Emitter::EmitTan(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
......@@ -333,6 +605,7 @@ void runtime::gpu::GPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
......@@ -340,6 +613,7 @@ void runtime::gpu::GPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
......@@ -347,6 +621,7 @@ void runtime::gpu::GPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
......@@ -354,6 +629,7 @@ void runtime::gpu::GPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
......@@ -361,6 +637,7 @@ void runtime::gpu::GPU_Emitter::EmitPower(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
......@@ -369,6 +646,7 @@ void runtime::gpu::GPU_Emitter::EmitReplaceSlice(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
......@@ -376,6 +654,7 @@ void runtime::gpu::GPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
......@@ -383,6 +662,7 @@ void runtime::gpu::GPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
......@@ -390,6 +670,7 @@ void runtime::gpu::GPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
......@@ -397,6 +678,7 @@ void runtime::gpu::GPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitConvolution(
......@@ -405,6 +687,7 @@ void runtime::gpu::GPU_Emitter::EmitConvolution(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitNot(codegen::CodeWriter& writer,
......@@ -412,6 +695,7 @@ void runtime::gpu::GPU_Emitter::EmitNot(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
......@@ -419,6 +703,7 @@ void runtime::gpu::GPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
......@@ -426,6 +711,7 @@ void runtime::gpu::GPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitReduceWindow(
......@@ -434,6 +720,7 @@ void runtime::gpu::GPU_Emitter::EmitReduceWindow(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
......@@ -442,4 +729,5 @@ void runtime::gpu::GPU_Emitter::EmitSelectAndScatter(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
throw ngraph_error("Op not supported in GPU Backend");
}
......@@ -199,7 +199,6 @@ runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
: ngraph::runtime::ExternalFunction(function, release_function)
, m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
, m_use_tbb(std::getenv("NGRAPH_GPU_USE_TBB") != nullptr)
{
}
......@@ -226,8 +225,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
writer +=
R"(// Generated by the NGraph GPU backend
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
#include <cassert>
#include <cmath>
#include <cstdlib>
......@@ -242,6 +239,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include <unordered_map>
#include <cuda_runtime.h>
#include <cudnn_v7.h>
#include "cublas_v2.h"
#include "cuda.h"
......@@ -317,6 +315,74 @@ void runtime::gpu::GPU_ExternalFunction::compile()
using namespace std;
)";
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
// // The "dso_handle" symbol is required by __cxa_atexit()
// // which is enabled because the JIT uses it as the default mechanism
// // to register cleanup handlers. We use it, and not atexit(), because
......@@ -349,17 +415,13 @@ void runtime::gpu::GPU_ExternalFunction::compile()
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name()
<< "(void** inputs, void** outputs, cublasHandle_t& cublas_handle);\n";
<< "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle);\n";
}
writer << "\n";
unordered_map<Node*, string> match_functions;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
......@@ -377,80 +439,12 @@ void runtime::gpu::GPU_ExternalFunction::compile()
}
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, cublasHandle_t& cublas_handle)\n";
writer << "(void*** inputs, void*** outputs, cublasHandle_t& cublas_handle)\n";
writer << "{\n";
writer.indent++;
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
}
}
}
for (const string& s : names)
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "const char* rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n";
}
writer << "default: rc = \"\";\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n";
writer.indent++;
writer << "size_t rc;\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--;
writer << "}\n";
writer << "\n";
}
bool temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (node->liveness_new_list.size() > 0)
......@@ -462,10 +456,25 @@ void runtime::gpu::GPU_ExternalFunction::compile()
}
}
}
if (temporaries_used)
{
// TODO use temporary variables
size_t temp_pool_size = current_function->get_temporary_pool_size();
writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
writer << "void** pool_base_ptr = runtime::gpu::create_gpu_buffer(" << temp_pool_size
<< ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
stringstream ss;
ss << "((" << tensor->get_element_type().c_type_string()
<< "*)(pool_base_ptr + " << tensor->get_pool_offset() << "))";
m_variable_name_map[tensor->get_name()] = ss.str();
}
}
}
// Add inputs to the variable name map
......@@ -517,10 +526,10 @@ void runtime::gpu::GPU_ExternalFunction::compile()
if (tv == ptv)
{
parameter_as_output = true;
writer << "memcpy(static_cast<" << et.c_type_string() << "*>(outputs["
<< output_index << "]), "
writer << "runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
<< et.c_type_string() << "*>(outputs[" << output_index << "]), "
<< m_variable_name_map[ptv->get_tensor().get_name()] << ", "
<< ptv->get_tensor().size() << ");\n";
<< ptv->get_tensor().size() << ",1);\n";
break;
}
}
......@@ -529,7 +538,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
{
if (contains(constants, tv.get()))
{
writer << "memcpy(outputs[" << output_index << "], "
writer << "runtime::gpu::cuda_memcpyHtD(outputs[" << output_index << "], "
<< tv->get_tensor().get_name() << ", " << tv->get_tensor().size()
<< ");\n";
}
......@@ -618,7 +627,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
// End generated function
writer += "}\n\n";
}
// TODO: Cleanup and make this a utility function
file_util::make_directory(s_output_dir);
......@@ -669,9 +677,9 @@ void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "memcpy(static_cast<void*>(outputs[" << outputs[i]
<< "]), static_cast<void*>(outputs[" << outputs[0] << "]), "
<< otv->get_tensor().size() << ");\n";
writer << "runtime::gpu::cuda_memcpyDtD(static_cast<void*>(outputs["
<< outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
<< "]), " << otv->get_tensor().size() << ",1);\n";
}
writer.indent--;
writer << "}\n";
......
......@@ -80,7 +80,6 @@ namespace ngraph
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map;
};
}
......
......@@ -17,8 +17,10 @@
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stddef.h>
#include <stdio.h>
#include <string>
#include "cuda.h"
#include "cuda_runtime.h"
......@@ -44,3 +46,21 @@ void runtime::gpu::check_cuda_errors(CUresult err)
{
assert(err == CUDA_SUCCESS);
}
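// Allocate buffer_size bytes of device memory and hand back the raw device
// pointer; note the cudaMalloc return status is not checked here.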
void** runtime::gpu::create_gpu_buffer(size_t buffer_size)
{
void** allocated_buffer_pool;
cudaMalloc(&allocated_buffer_pool, buffer_size);
return allocated_buffer_pool;
}
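// Device-to-device copy; the size is passed as an element count plus the size
// of each element, matching how the generated code invokes it.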
void runtime::gpu::cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size)
{
size_t size_in_bytes = element_size * element_count;
cudaMemcpy(d, s, size_in_bytes, cudaMemcpyDeviceToDevice);
}
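// Host-to-device copy of buffer_size bytes.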
void runtime::gpu::cuda_memcpyHtD(void* d, void* s, size_t buffer_size)
{
cudaMemcpy(d, s, buffer_size, cudaMemcpyHostToDevice);
}
......@@ -22,6 +22,9 @@ namespace ngraph
{
void print_gpu_f32_tensor(void* p, size_t element_count, size_t element_size);
void check_cuda_errors(CUresult err);
void** create_gpu_buffer(size_t buffer_size);
void cuda_memcpyDtD(void* d, void* s, size_t element_count, size_t element_size);
void cuda_memcpyHtD(void* d, void* s, size_t buffer_size);
}
}
}
......@@ -83,11 +83,12 @@ if(NGRAPH_GPU_ENABLE AND LLVM_INCLUDE_DIR)
link_directories(${LLVM_LIB_DIR})
link_directories(${CUDA_LIBRARIES})
link_directories(${CUDA_CUBLAS_LIBRARIES})
link_directories(${CUDNN_LIBRARIES})
set(SRC
${SRC}
cudnn.cpp)
# Disabled for testing
# set(BACKEND_NAMES ${BACKEND_NAMES} "GPU")
set(BACKEND_NAMES ${BACKEND_NAMES} "GPU")
endif()
if(NGRAPH_ARGON_ENABLE)
......
......@@ -263,63 +263,3 @@ const auto str = R"(
auto module = compiler.compile(source);
}
// TEST(cudnn, abc)
// {
// auto shape = Shape{2, 2};
// auto A = make_shared<op::Parameter>(element::f32, shape);
// auto B = make_shared<op::Parameter>(element::f32, shape);
// auto C = make_shared<op::Parameter>(element::f32, shape);
// auto f = make_shared<Function>((A + B) * C, op::Parameters{A, B, C});
// auto manager = runtime::Manager::get("GPU");
// auto external = manager->compile(f);
// auto backend = manager->allocate_backend();
// auto cf = backend->make_call_frame(external);
// // Create some tensors for input/output
// shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> c = backend->make_primary_tensor_view(element::f32, shape);
// shared_ptr<runtime::TensorView> result = backend->make_primary_tensor_view(element::f32, shape);
// copy_data(a, test::NDArray<float, 2>({{1, 2}, {3, 4}}).get_vector());
// copy_data(b, test::NDArray<float, 2>({{5, 6}, {7, 8}}).get_vector());
// copy_data(c, test::NDArray<float, 2>({{9, 10}, {11, 12}}).get_vector());
// cf->call({a, b, c}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
// cf->call({b, a, c}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{54, 80}, {110, 144}})).get_vector());
// cf->call({a, c, b}, {result});
// EXPECT_EQ(result->read_vector<float>(),
// (test::NDArray<float, 2>({{50, 72}, {98, 128}})).get_vector());
// }
TEST(cudnn, dot1d)
{
auto shape = Shape{4};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto shape_r = Shape{1};
auto f = make_shared<Function>(make_shared<op::Dot>(A, B), op::Parameters{A, B});
auto manager = runtime::Manager::get("GPU");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
// Create some tensors for input/output
auto a = backend->make_primary_tensor_view(element::f32, shape);
copy_data(a, vector<float>{2, 4, 8, 16});
auto b = backend->make_primary_tensor_view(element::f32, shape);
copy_data(b, vector<float>{1, 2, 4, 8});
auto result = backend->make_primary_tensor_view(element::f32, shape_r);
cf->call({a, b}, {result});
EXPECT_EQ((vector<float>{170}), read_vector<float>(result));
}