Commit bae77590 authored by Chris Sullivan, committed by adstraw

Add reduce sum to the GPU transformer (op::Sum) (#671)

* The current cuDNN implementations use only
a single dimension (width) for the ngraph tensor data.
In this case the tensor format should be set to

CUDNN_TENSOR_NCHW

so that adjacent memory accesses are coalesced (stride = 1 for width).
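
For reference, the descriptor setup that the generated code performs reduces to roughly the following host-side cuDNN calls (a sketch only; count stands for the flattened element count of the ngraph tensor):

    cudnnTensorDescriptor_t descriptor;
    cudnnCreateTensorDescriptor(&descriptor);
    // All elements are packed into the width dimension; with CUDNN_TENSOR_NCHW
    // the innermost (width) stride is 1, so adjacent accesses coalesce.
    cudnnSetTensor4dDescriptor(descriptor,
                               /*format=*/CUDNN_TENSOR_NCHW,
                               /*dataType=*/CUDNN_DATA_FLOAT,
                               /*batch_size=*/1,
                               /*channels=*/1,
                               /*height=*/1,
                               /*width=*/count);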

* * Added some kernel emitter helpers that are reused often.
  * Renamed EmitElementwise -> emit_elementwise to match emit<T>.
  * op::Sum now handles the trivial case of dim(input_tensor) = dim(output_tensor)
    by performing a memcpy, as no axes are reduced (see the sketch below).
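
The trivial case just emits a device-to-device copy via the new emit_memcpyDtD helper; abridged from the gpu_kernel_emitters.cpp added in this commit:

    // When no axes are reduced the output has the same shape (and size) as the
    // input, so a whole-buffer device-to-device copy is sufficient.
    void emit_memcpyDtD(codegen::CodeWriter& writer,
                        const GPU_TensorViewWrapper& dst,
                        const GPU_TensorViewWrapper& src)
    {
        writer << "runtime::gpu::cuda_memcpyDtD(" << dst.get_name() << ", " << src.get_name() << ", "
               << dst.get_size() << " * " << dst.get_element_type().size() << ");\n";
    }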

* Added a general case for Nd descriptors, used when the tensor
  has more than 4 dimensions. Currently a naive reduce is performed;
  in the future a coordinate transformation could be applied to
  improve the memory layout for the reduction (the descriptor strides are
  computed as sketched below).
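
The Nd descriptors use packed row-major strides derived from the shape; the op::Sum emitter computes them with a small lambda along these lines:

    // stride[i] is the product of all dimensions to the right of i (packed
    // row-major layout), e.g. dim = {2, 3, 4} -> strides = {12, 4, 1}.
    auto compute_strides = [](const std::vector<size_t>& dim) {
        std::vector<size_t> strides(dim.size(), 1);
        std::copy(dim.begin() + 1, dim.end(), strides.begin());
        for (int64_t i = dim.size() - 2; i >= 0; i--)
        {
            strides[i] *= strides[i + 1];
        }
        return strides;
    };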

* Switched to codegen::CodeWriter::block_begin/end.
It appears that CodeWriter::block_begin/end was not frequently used by the emitters (in the cpu and gpu transformers)
because a block comment is often desired next to the opening brace. To this end I added prefix/suffix default parameters to CodeWriter::block_begin/end
so that this functionality is captured (usage sketched below).
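
With the new defaults, the manual brace/indent bookkeeping in the emitters collapses to a pair of calls, for example:

    // Before:
    //   writer << "{ // " << node->get_name() << "\n";
    //   writer.indent++;
    //   ...
    //   writer.indent--;
    //   writer << "}\n";
    //
    // After: the block comment is passed as the prefix of the opening brace.
    writer.block_begin(" // " + node->get_name());
    // ... emit the body ...
    writer.block_end();
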
parent 72f4d661
...@@ -68,16 +68,16 @@ public: ...@@ -68,16 +68,16 @@ public:
std::string generate_temporary_name(std::string prefix = "tempvar"); std::string generate_temporary_name(std::string prefix = "tempvar");
void block_begin() void block_begin(std::string block_prefix = "")
{ {
*this << "{\n"; *this << "{" << block_prefix << "\n";
indent++; indent++;
} }
void block_end() void block_end(std::string block_suffix = "")
{ {
indent--; indent--;
*this << "}\n"; *this << "}" << block_suffix << "\n";
} }
private: private:
......
...@@ -103,10 +103,10 @@ namespace ngraph ...@@ -103,10 +103,10 @@ namespace ngraph
{ {
namespace gpu namespace gpu
{ {
void GPU_Emitter::EmitElementwise( void GPU_Emitter::emit_elementwise(
GPU_ExternalFunction* external_function, GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const ngraph::Node* n, const ngraph::Node* node,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args, const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out) const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{ {
...@@ -115,12 +115,11 @@ namespace ngraph ...@@ -115,12 +115,11 @@ namespace ngraph
return; return;
} }
writer << "{ // " << n->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n"; writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::" writer << "ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::"
<< n->description() << ">(\"" << n->description() << "\"" << node->description() << ">(\"" << node->description() << "\""
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}" << ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", count" << ", count"
<< ", CUdeviceptr(" << out[0].get_name() << ")"; << ", CUdeviceptr(" << out[0].get_name() << ")";
...@@ -129,8 +128,7 @@ namespace ngraph ...@@ -129,8 +128,7 @@ namespace ngraph
writer << ", CUdeviceptr(" << args[i].get_name() << ")"; writer << ", CUdeviceptr(" << args[i].get_name() << ")";
} }
writer << ");\n"; writer << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
...@@ -140,15 +138,14 @@ namespace ngraph ...@@ -140,15 +138,14 @@ namespace ngraph
{ {
return; return;
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer += R"( writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0; float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor, cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC, /*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT, /*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1, /*batch_size=*/1,
/*channels=*/1, /*channels=*/1,
...@@ -171,8 +168,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -171,8 +168,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[1].get_name() << "," << "descriptor," << args[1].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
...@@ -190,8 +186,8 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -190,8 +186,8 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{ {
auto& first = (arg0_shape.empty() ? args[0] : args[1]); auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]); auto& second = (arg0_shape.empty() ? args[1] : args[0]);
writer << "{ // " << node->get_name() << "\n";
writer.indent++; writer.block_begin(" // " + node->get_name());
writer << "int count = " << second.get_size() << ";\n"; writer << "int count = " << second.get_size() << ";\n";
writer << "cublasScopy(" writer << "cublasScopy("
<< "cublas_handle," << "cublas_handle,"
...@@ -201,20 +197,17 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -201,20 +197,17 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "cublas_handle," << "cublas_handle,"
<< "count ," << first.get_name() << "," << out[0].get_name() << "count ," << first.get_name() << "," << out[0].get_name()
<< ", 1);\n"; << ", 1);\n";
writer.indent--; writer.block_end();
writer << "}\n";
return; return;
} }
//set output to 0 if input size is 0 //set output to 0 if input size is 0
if (args[0].get_size() == 0 || args[1].get_size() == 0) if (args[0].get_size() == 0 || args[1].get_size() == 0)
{ {
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, " writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * sizeof(float));\n"; << out[0].get_size() << " * sizeof(float));\n";
writer.indent--; writer.block_end();
writer << "}\n";
return; return;
} }
...@@ -231,21 +224,18 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -231,21 +224,18 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
"input1 and input2 shape does not match for dot;"); "input1 and input2 shape does not match for dot;");
} }
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "cublasSdot(" writer << "cublasSdot("
<< "cublas_handle," << args[0].get_size() << "," << args[0].get_name() << "cublas_handle," << args[0].get_size() << "," << args[0].get_name()
<< "," << ","
<< "1," << args[1].get_name() << "," << "1," << args[1].get_name() << ","
<< "1," << out[0].get_name() << ");\n"; << "1," << out[0].get_name() << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) &&
(dot->get_reduction_axes_count() == 1)) (dot->get_reduction_axes_count() == 1))
{ {
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "const float alpha = 1.0;\n"; writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n"; writer << "const float beta = 0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n"; writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
...@@ -260,8 +250,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -260,8 +250,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< out[0].get_name() << "," << out[0].get_name() << ","
<< "1);\n"; << "1);\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) &&
(dot->get_reduction_axes_count() == 1)) (dot->get_reduction_axes_count() == 1))
...@@ -273,8 +262,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -273,8 +262,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{ {
throw std::runtime_error("input and output shape does not match for dot;"); throw std::runtime_error("input and output shape does not match for dot;");
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "const float alpha = 1.0;\n"; writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0.0;\n"; writer << "const float beta = 0.0;\n";
writer << "int m = " << arg0_shape[0] << ";\n"; writer << "int m = " << arg0_shape[0] << ";\n";
...@@ -296,8 +284,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -296,8 +284,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< out[0].get_name() << "," << out[0].get_name() << ","
<< "n);\n"; << "n);\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
else else
{ {
...@@ -313,15 +300,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -313,15 +300,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{ {
return; return;
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer += R"( writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0; float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor, cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC, /*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT, /*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1, /*batch_size=*/1,
/*channels=*/1, /*channels=*/1,
...@@ -344,8 +330,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -344,8 +330,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[1].get_name() << "," << "descriptor," << args[1].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
...@@ -355,15 +340,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -355,15 +340,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{ {
return; return;
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer += R"( writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0; float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor, cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC, /*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT, /*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1, /*batch_size=*/1,
/*channels=*/1, /*channels=*/1,
...@@ -386,8 +370,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -386,8 +370,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[1].get_name() << "," << "descriptor," << args[1].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
...@@ -397,15 +380,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -397,15 +380,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{ {
return; return;
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer += R"( writer += R"(
float alpha1 = -1.0, alpha2 = 0, beta = 0; float alpha1 = -1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor, cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC, /*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT, /*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1, /*batch_size=*/1,
/*channels=*/1, /*channels=*/1,
...@@ -428,8 +410,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -428,8 +410,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
...@@ -447,13 +428,9 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -447,13 +428,9 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
//broadcast axes is empty, do a copy //broadcast axes is empty, do a copy
if (axes.empty()) if (axes.empty())
{ {
writer << "{ // " << node->get_name() << " \n"; writer.block_begin(" // " + node->get_name());
writer.indent++; kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " writer.block_end();
<< args[0].get_name() << ", " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
return; return;
} }
...@@ -487,8 +464,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -487,8 +464,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
repeat_size *= result_shape[i]; repeat_size *= result_shape[i];
} }
writer << "{ // " << node->get_name() << " \n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "runtime::gpu::emit_broadcast(\"" << node->description() writer << "runtime::gpu::emit_broadcast(\"" << node->description()
<< "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr(" << "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
<< out[0].get_name() << ")" << out[0].get_name() << ")"
...@@ -496,9 +472,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -496,9 +472,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "\"}" << "\"}"
<< ", " << repeat_size << ", " << repeat_times << ", " << ", " << repeat_size << ", " << repeat_times << ", "
<< out[0].get_size() << ");\n"; << out[0].get_size() << ");\n";
writer.block_end();
writer.indent--;
writer << "}\n";
} }
else else
{ {
...@@ -519,8 +493,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -519,8 +493,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
return; return;
} }
auto reshape = static_cast<const op::Reshape*>(node); auto reshape = static_cast<const op::Reshape*>(node);
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
auto arg_shape = args[0].get_shape(); auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size(); auto arg_rank = arg_shape.size();
auto result_shape = out[0].get_shape(); auto result_shape = out[0].get_shape();
...@@ -536,19 +509,12 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -536,19 +509,12 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
// we can just copy. // we can just copy.
if (same_layout || result_shape_product < 2) if (same_layout || result_shape_product < 2)
{ {
writer << "{ // " << node->get_name() << " 1\n"; kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
<< args[0].get_name() << ", " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
} }
// If there *is* a layout change in the 2D case, we transpose the input. // If there *is* a layout change in the 2D case, we transpose the input.
else if (arg_rank == 2) else if (arg_rank == 2)
{ {
// TODO Assert arg0_shape[0] == arg1_shape[0]? // TODO Assert arg0_shape[0] == arg1_shape[0]?
writer << "{ // " << node->get_name() << "\n";
writer.indent++; writer.indent++;
writer << "const float alpha = 1.0;\n"; writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n"; writer << "const float beta = 0;\n";
...@@ -563,8 +529,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -563,8 +529,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< args[0].get_name() << "," << arg_shape[1] << "," << out[0].get_name() << args[0].get_name() << "," << arg_shape[1] << "," << out[0].get_name()
<< "," << result_shape[1] << ");\n"; << "," << result_shape[1] << ");\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
} }
// Other cases (reordering of axes for tensors with rank>2) are not handled yet. // Other cases (reordering of axes for tensors with rank>2) are not handled yet.
else else
...@@ -573,8 +537,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -573,8 +537,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
"Axis permutation in reshape is not implemented yet for tensors with " "Axis permutation in reshape is not implemented yet for tensors with "
"rank>2"); "rank>2");
} }
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
...@@ -589,15 +552,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -589,15 +552,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{ {
return; return;
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer += R"( writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0; float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor, cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC, /*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT, /*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1, /*batch_size=*/1,
/*channels=*/1, /*channels=*/1,
...@@ -620,8 +582,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -620,8 +582,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[1].get_name() << "," << "descriptor," << args[1].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
...@@ -642,8 +603,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -642,8 +603,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
repeat_size *= result_shape[i]; repeat_size *= result_shape[i];
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, " writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n"; << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer << "runtime::gpu::emit_onehot(\"" << node->description() writer << "runtime::gpu::emit_onehot(\"" << node->description()
...@@ -652,8 +612,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -652,8 +612,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}" << ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", " << repeat_size << ", " << repeat_times << ", " << args[0].get_size() << ", " << repeat_size << ", " << repeat_times << ", " << args[0].get_size()
<< ");\n"; << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
...@@ -663,15 +622,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -663,15 +622,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{ {
return; return;
} }
writer << "{ // " << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n"; writer << "int count = " << out[0].get_size() << ";\n";
writer += R"( writer += R"(
float alpha1 = 1.0, alpha2 = 0, beta = 0; float alpha1 = 1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor; cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor); cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor, cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC, /*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT, /*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1, /*batch_size=*/1,
/*channels=*/1, /*channels=*/1,
...@@ -694,20 +652,128 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -694,20 +652,128 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[0].get_name() << "," << "descriptor," << args[0].get_name() << ","
<< "&beta," << "&beta,"
<< "descriptor," << out[0].get_name() << ");\n"; << "descriptor," << out[0].get_name() << ");\n";
writer.indent--; writer.block_end();
writer << "}\n";
} }
template <> template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Result) void GPU_Emitter::EMITTER_DECL(ngraph::op::Result)
{ {
writer << "{ //" << node->get_name() << "\n"; writer.block_begin(" // " + node->get_name());
writer.indent++; kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " writer.block_end();
<< args[0].get_name() << ", " << out[0].get_size() << " * " return;
<< out[0].get_element_type().size() << ");\n"; }
writer.indent--;
writer << "}\n"; template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Sum)
{
auto sum_node = static_cast<const ngraph::op::Sum*>(node);
auto reduction_axes = sum_node->get_reduction_axes();
auto& input_shape = args[0].get_shape();
const std::string input_desc = "input_descriptor";
const std::string output_desc = "output_descriptor";
const std::string tensor_type = "CUDNN_DATA_FLOAT";
const std::string tensor_format = "CUDNN_TENSOR_NCHW";
writer.block_begin(" // " + node->get_name());
{
if (out[0].get_size() != 0)
{
// one of args[] axes has zero size, zero output
if (args[0].get_size() == 0)
{
kernel::emit_memset(writer, out[0], 0);
}
// no change in dimensions, reduction not necessary
else if (input_shape.size() == out[0].get_shape().size())
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
// descriptors for tensors with <= 4 dimensions
else if (input_shape.size() <= 4)
{
// construct input tensor descriptor rt impl.
std::array<size_t, 4> dimensions;
size_t pos = 0;
for (size_t i = input_shape.size(); i < 4; i++)
{
dimensions[pos++] = 1;
}
for (size_t i = 0; i < input_shape.size(); i++)
{
dimensions[pos++] = input_shape[i];
}
kernel::emit_cudnnTensor4dDescriptor(
writer, input_desc, tensor_format, tensor_type, dimensions);
// mark reduced axes of input tensor for output tensor descriptor
for (auto const& idx_dim : reduction_axes)
{
dimensions[(4 - input_shape.size()) + idx_dim] = 1;
}
kernel::emit_cudnnTensor4dDescriptor(
writer, output_desc, tensor_format, tensor_type, dimensions);
// emit sum reduce operation
kernel::emit_cudnnReduceTensor(writer,
args[0],
out[0],
"CUDNN_REDUCE_TENSOR_ADD",
tensor_type,
"CUDNN_NOT_PROPAGATE_NAN",
input_desc,
output_desc,
1.0,
0.0);
}
// descriptors for Nd tensors
else
{
std::vector<size_t> dimensions = input_shape;
auto compute_strides = [](const std::vector<size_t>& dim) {
std::vector<size_t> strides(dim.size(), 1);
std::copy(dim.begin() + 1, dim.end(), strides.begin());
for (int64_t i = dim.size() - 2; i >= 0; i--)
{
strides[i] *= strides[i + 1];
}
return strides;
};
kernel::emit_cudnnTensorNdDescriptor(writer,
input_desc,
tensor_type,
dimensions.size(),
dimensions,
compute_strides(dimensions));
// mark reduced axes of input tensor for output tensor descriptor
for (auto const& idx_dim : reduction_axes)
{
dimensions[idx_dim] = 1;
}
kernel::emit_cudnnTensorNdDescriptor(writer,
output_desc,
tensor_type,
dimensions.size(),
dimensions,
compute_strides(dimensions));
// emit sum reduce operation
kernel::emit_cudnnReduceTensor(writer,
args[0],
out[0],
"CUDNN_REDUCE_TENSOR_ADD",
tensor_type,
"CUDNN_NOT_PROPAGATE_NAN",
input_desc,
output_desc,
1.0,
0.0);
}
}
}
writer.block_end();
return; return;
} }
} }
......
...@@ -58,7 +58,7 @@ namespace ngraph ...@@ -58,7 +58,7 @@ namespace ngraph
{ {
} }
static void EmitElementwise(GPU_ExternalFunction* external_function, static void emit_elementwise(GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const ngraph::Node* node, const ngraph::Node* node,
const std::vector<GPU_TensorViewWrapper>& args, const std::vector<GPU_TensorViewWrapper>& args,
......
...@@ -165,54 +165,54 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -165,54 +165,54 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Dot>}, {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Dot>},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Multiply>}, {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Multiply>},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::nop}, {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::nop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>}, {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::GetOutputElement), {TI(ngraph::op::GetOutputElement),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>}, &runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>}, {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>}, {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>}, {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>}, {TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>}, {TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>}, {TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>},
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>}, {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>}, {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reduce>}, {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reduce>},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Slice>}, {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Slice>},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sum>}, {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sum>},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReplaceSlice>}, {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::OneHot>}, {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::OneHot>},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sqrt>}, {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sqrt>},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convolution>}, {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropFilters), {TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>}, &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBackpropData), {TI(ngraph::op::ConvolutionBackpropData),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>}, &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>}, {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>}, {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>}, {TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>},
...@@ -231,8 +231,8 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -231,8 +231,8 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Product), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Product>}, {TI(ngraph::op::Product), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Product>},
{TI(ngraph::op::Max), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Max>}, {TI(ngraph::op::Max), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Max>},
{TI(ngraph::op::Min), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Min>}, {TI(ngraph::op::Min), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Min>},
{TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Softmax), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Softmax>}, {TI(ngraph::op::Softmax), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Softmax>},
}; };
......
...@@ -13,11 +13,113 @@ ...@@ -13,11 +13,113 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*******************************************************************************/ *******************************************************************************/
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include "gpu_kernel_emitters.hpp"
#include "ngraph/codegen/code_writer.hpp" #include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp" #include "ngraph/util.hpp"
using namespace ngraph; using namespace ngraph;
using namespace ngraph::runtime::gpu::kernel;
void runtime::gpu::kernel::emit_memset(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
int value,
size_t buffer_size)
{
if (buffer_size == 0)
{
buffer_size = dst.get_size() * dst.get_element_type().size();
}
writer << "runtime::gpu::cuda_memset(" << dst.get_name() << ", " << value << ", " << buffer_size
<< ");\n";
}
void runtime::gpu::kernel::emit_memcpyDtD(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
const GPU_TensorViewWrapper& src)
{
writer << "runtime::gpu::cuda_memcpyDtD(" << dst.get_name() << ", " << src.get_name() << ", "
<< dst.get_size() << " * " << dst.get_element_type().size() << ");\n";
return;
}
void runtime::gpu::kernel::emit_cudnnTensor4dDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& format,
const std::string& data_type,
const std::array<size_t, 4>& axes)
{
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "cudnnCreateTensorDescriptor(&" << name << ");\n";
writer << "cudnnSetTensor4dDescriptor(" << name << ",\n";
writer << " /*format=*/" << format << ",\n";
writer << " /*dataType=*/" << data_type;
for (auto const& axis : axes)
{
writer << ",\n /*dimension_size*/" << axis;
}
writer << ");\n";
}
void runtime::gpu::kernel::emit_cudnnTensorNdDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const size_t& num_axes,
const std::vector<size_t>& axes,
const std::vector<size_t>& strides)
{
writer << "const int " << name << "_axes[] = {" << join(axes) << "};\n";
writer << "const int " << name << "_strides[] = {" << join(strides) << "};\n";
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "cudnnCreateTensorDescriptor(&" << name << ");\n";
writer << "cudnnSetTensorNdDescriptor(" << name << ",\n";
writer << " /*dataType=*/" << data_type << ",\n";
writer << " /*num_dimensions=*/" << num_axes << ",\n";
writer << " /*dimensions*/" << name << "_axes,\n";
writer << " /*strides*/" << name << "_strides);\n";
}
void runtime::gpu::kernel::emit_cudnnReduceTensor(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& in,
const GPU_TensorViewWrapper& out,
const std::string& reduce_op,
const std::string& data_type,
const std::string& nan_prop,
const std::string& input_desc,
const std::string& output_desc,
const float& alpha,
const float& beta)
{
writer << "cudnnReduceTensorDescriptor_t reduceTensorDesc;\n";
writer << "cudnnCreateReduceTensorDescriptor(&reduceTensorDesc);\n";
writer << "cudnnSetReduceTensorDescriptor(reduceTensorDesc,\n";
writer << " " << reduce_op << ",\n";
writer << " " << data_type << ",\n";
writer << " " << nan_prop << ",\n";
writer << " CUDNN_REDUCE_TENSOR_NO_INDICES,\n";
writer << " CUDNN_32BIT_INDICES);\n";
writer << "size_t workspace_size = 0;\n";
writer << "cudnnGetReductionWorkspaceSize(cudnn_handle,\n";
writer << " reduceTensorDesc,\n";
writer << " " << input_desc << ",\n";
writer << " " << output_desc << ",\n";
writer << " &workspace_size);\n";
writer << "void* workspace_ptr = "
"ngraph::runtime::gpu::create_gpu_buffer(workspace_size);\n";
writer << "float alpha = " << alpha << ", beta = " << beta << ";\n";
writer << "cudnnReduceTensor(cudnn_handle,\n";
writer << " reduceTensorDesc,\n";
writer << " nullptr,\n";
writer << " 0,\n";
writer << " workspace_ptr,\n";
writer << " workspace_size,\n";
writer << " &alpha,\n";
writer << " " << input_desc << ",\n";
writer << " " << in.get_name() << ",\n";
writer << " &beta,\n";
writer << " " << output_desc << ",\n";
writer << " " << out.get_name() << ");\n";
writer << "ngraph::runtime::gpu::free_gpu_buffer(workspace_ptr);\n";
}
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#pragma once #pragma once
#include "ngraph/codegen/code_writer.hpp" #include "ngraph/codegen/code_writer.hpp"
#include "ngraph/node.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
namespace ngraph namespace ngraph
{ {
...@@ -26,6 +28,38 @@ namespace ngraph ...@@ -26,6 +28,38 @@ namespace ngraph
{ {
namespace kernel namespace kernel
{ {
void emit_memset(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
int value,
size_t buffer_size = 0);
void emit_memcpyDtD(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
const GPU_TensorViewWrapper& src);
void emit_cudnnTensor4dDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& format,
const std::string& data_type,
const std::array<size_t, 4>& axes);
void emit_cudnnTensorNdDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const size_t& num_axes,
const std::vector<size_t>& axes,
const std::vector<size_t>& strides);
void emit_cudnnReduceTensor(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& in,
const GPU_TensorViewWrapper& out,
const std::string& reduce_op,
const std::string& data_type,
const std::string& nan_prop,
const std::string& input_desc,
const std::string& output_desc,
const float& alpha,
const float& beta);
} }
} }
} }
......
...@@ -3163,7 +3163,6 @@ TEST(${BACKEND_NAME}, tensor_constant_int64) ...@@ -3163,7 +3163,6 @@ TEST(${BACKEND_NAME}, tensor_constant_int64)
// Trivial case with no summed axes. // Trivial case with no summed axes.
TEST(${BACKEND_NAME}, sum_trivial) TEST(${BACKEND_NAME}, sum_trivial)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A});
...@@ -3185,7 +3184,6 @@ TEST(${BACKEND_NAME}, sum_trivial) ...@@ -3185,7 +3184,6 @@ TEST(${BACKEND_NAME}, sum_trivial)
// Failure has been reported at 5D for some reason // Failure has been reported at 5D for some reason
TEST(${BACKEND_NAME}, sum_trivial_5d) TEST(${BACKEND_NAME}, sum_trivial_5d)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2, 2, 2}; Shape shape{2, 2, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A});
...@@ -3209,7 +3207,6 @@ TEST(${BACKEND_NAME}, sum_trivial_5d) ...@@ -3209,7 +3207,6 @@ TEST(${BACKEND_NAME}, sum_trivial_5d)
TEST(${BACKEND_NAME}, sum_to_scalar) TEST(${BACKEND_NAME}, sum_to_scalar)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0, 1}), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0, 1}), op::ParameterVector{A});
...@@ -3234,7 +3231,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar) ...@@ -3234,7 +3231,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar)
TEST(${BACKEND_NAME}, sum_matrix_columns) TEST(${BACKEND_NAME}, sum_matrix_columns)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 2}; Shape shape_a{3, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{2}; Shape shape_rt{2};
...@@ -3260,7 +3256,6 @@ TEST(${BACKEND_NAME}, sum_matrix_columns) ...@@ -3260,7 +3256,6 @@ TEST(${BACKEND_NAME}, sum_matrix_columns)
TEST(${BACKEND_NAME}, sum_matrix_rows) TEST(${BACKEND_NAME}, sum_matrix_rows)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 2}; Shape shape_a{3, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3}; Shape shape_rt{3};
...@@ -3286,7 +3281,6 @@ TEST(${BACKEND_NAME}, sum_matrix_rows) ...@@ -3286,7 +3281,6 @@ TEST(${BACKEND_NAME}, sum_matrix_rows)
TEST(${BACKEND_NAME}, sum_matrix_rows_zero) TEST(${BACKEND_NAME}, sum_matrix_rows_zero)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 0}; Shape shape_a{3, 0};
...@@ -3315,9 +3309,7 @@ TEST(${BACKEND_NAME}, sum_matrix_rows_zero) ...@@ -3315,9 +3309,7 @@ TEST(${BACKEND_NAME}, sum_matrix_rows_zero)
TEST(${BACKEND_NAME}, sum_matrix_cols_zero) TEST(${BACKEND_NAME}, sum_matrix_cols_zero)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
// Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})). // Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})).
Shape shape_a{0, 2}; Shape shape_a{0, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
...@@ -3345,7 +3337,6 @@ TEST(${BACKEND_NAME}, sum_matrix_cols_zero) ...@@ -3345,7 +3337,6 @@ TEST(${BACKEND_NAME}, sum_matrix_cols_zero)
TEST(${BACKEND_NAME}, sum_vector_zero) TEST(${BACKEND_NAME}, sum_vector_zero)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{0}; Shape shape_a{0};
...@@ -3374,7 +3365,6 @@ TEST(${BACKEND_NAME}, sum_vector_zero) ...@@ -3374,7 +3365,6 @@ TEST(${BACKEND_NAME}, sum_vector_zero)
TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero) TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{0, 0}; Shape shape_a{0, 0};
...@@ -3403,7 +3393,6 @@ TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero) ...@@ -3403,7 +3393,6 @@ TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero)
TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig) TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3, 3}; Shape shape_rt{3, 3};
...@@ -3435,7 +3424,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig) ...@@ -3435,7 +3424,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig)
TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig) TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3, 3}; Shape shape_rt{3, 3};
...@@ -3467,7 +3455,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig) ...@@ -3467,7 +3455,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig)
TEST(${BACKEND_NAME}, sum_3d_to_vector) TEST(${BACKEND_NAME}, sum_3d_to_vector)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3}; Shape shape_rt{3};
...@@ -3493,7 +3480,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector) ...@@ -3493,7 +3480,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector)
TEST(${BACKEND_NAME}, sum_3d_to_scalar) TEST(${BACKEND_NAME}, sum_3d_to_scalar)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{}; Shape shape_rt{};
...@@ -3519,7 +3505,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_scalar) ...@@ -3519,7 +3505,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_scalar)
TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim) TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 0, 2}; Shape shape_a{3, 0, 2};
...@@ -3546,7 +3531,6 @@ TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim) ...@@ -3546,7 +3531,6 @@ TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim)
TEST(${BACKEND_NAME}, sum_to_scalar_stable) TEST(${BACKEND_NAME}, sum_to_scalar_stable)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
...@@ -3570,7 +3554,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar_stable) ...@@ -3570,7 +3554,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar_stable)
TEST(${BACKEND_NAME}, sum_3d_to_vector_stable) TEST(${BACKEND_NAME}, sum_3d_to_vector_stable)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
...@@ -3594,6 +3577,28 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector_stable) ...@@ -3594,6 +3577,28 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector_stable)
test::all_close(read_vector<float>(result), vector<float>{1e-4f, 1e-5f, 1e-6f}, 5e-2f)); test::all_close(read_vector<float>(result), vector<float>{1e-4f, 1e-5f, 1e-6f}, 5e-2f));
} }
TEST(${BACKEND_NAME}, sum_5d_to_scalar)
{
Shape shape_a{3, 3, 3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{};
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0, 1, 2, 3, 4}),
op::ParameterVector{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
// Create some tensors for input/output
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, std::vector<float>(std::pow(3, 5), 1));
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
cf->call({result}, {a});
EXPECT_EQ(std::vector<float>{243.}, read_vector<float>(result));
}
TEST(${BACKEND_NAME}, sign) TEST(${BACKEND_NAME}, sign)
{ {
Shape shape{2, 3}; Shape shape{2, 3};
......