Commit bae77590 authored by Chris Sullivan, committed by adstraw

Add reduce sum to the GPU transformer (op::Sum) (#671)

* Current cuDNN-based op implementations use only a single dimension
for the ngraph tensor data (width). In this case the tensor format should be set to

CUDNN_TENSOR_NCHW

so that adjacent memory accesses are coalesced (stride = 1 for the width dimension).
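
For illustration, a minimal sketch of the descriptor setup this produces for a flat
buffer of count floats (count stands in for out[0].get_size() in the emitted code):

    // Describe the whole flat buffer as a 1 x 1 x 1 x count NCHW tensor so the
    // innermost (width) stride is 1 and adjacent accesses coalesce.
    cudnnTensorDescriptor_t descriptor;
    cudnnCreateTensorDescriptor(&descriptor);
    cudnnSetTensor4dDescriptor(descriptor,
                               /*format=*/CUDNN_TENSOR_NCHW,
                               /*dataType=*/CUDNN_DATA_FLOAT,
                               /*batch_size=*/1,
                               /*channels=*/1,
                               /*height=*/1,
                               /*width=*/count);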

* * Added some kernel emitter helpers that are reused often (see the sketch after this list).
* Renamed EmitElementwise -> emit_elementwise to match emit<T>.
* op::Sum now handles the trivial case of dim(input_tensor) == dim(output_tensor)
  by performing a memcpy, since no axes are reduced.
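
The new device-to-device copy helper replaces the hand-written cuda_memcpyDtD emission;
a minimal sketch of its use and of the code it emits:

    // In an emitter: copy the input tensor view straight into the output tensor view.
    kernel::emit_memcpyDtD(writer, out[0], args[0]);
    // Emitted code (roughly):
    //   runtime::gpu::cuda_memcpyDtD(<out name>, <arg name>,
    //                                <out size> * <element size>);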

*   Added a general case for Nd descriptors, which is used when the tensor
  has more than 4 dimensions. Currently a naive reduce is performed;
  in the future a coordinate transformation could be applied to
  improve the memory layout for the reduction.
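
For reference, a minimal sketch of the row-major stride computation used when building
the Nd descriptors (mirroring the compute_strides lambda in the Sum emitter):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Row-major strides: stride[last] = 1 and stride[i] = stride[i+1] * dim[i+1].
    std::vector<size_t> compute_strides(const std::vector<size_t>& dim)
    {
        std::vector<size_t> strides(dim.size(), 1);
        std::copy(dim.begin() + 1, dim.end(), strides.begin());
        for (int64_t i = static_cast<int64_t>(dim.size()) - 2; i >= 0; i--)
        {
            strides[i] *= strides[i + 1];
        }
        return strides;
    }
    // Example: dim = {2, 3, 4, 5, 6} -> strides = {360, 120, 30, 6, 1}.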

* Switched to codegen::CodeWriter::block_begin/end.
It appears that CodeWriter::block_begin/end was not used much by the emitters (in the CPU and GPU transformers)
because a block comment is often desired. To that end I added prefix/suffix default parameters to CodeWriter::block_begin/end
so that this functionality is captured.
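
A minimal sketch of the resulting usage (the node name in the emitted comment is illustrative):

    writer.block_begin(" // " + node->get_name()); // emits e.g. "{ // Sum_3"
    writer << "int count = " << out[0].get_size() << ";\n";
    writer.block_end();                            // emits "}"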
parent 72f4d661
......@@ -68,16 +68,16 @@ public:
std::string generate_temporary_name(std::string prefix = "tempvar");
void block_begin()
void block_begin(std::string block_prefix = "")
{
*this << "{\n";
*this << "{" << block_prefix << "\n";
indent++;
}
void block_end()
void block_end(std::string block_suffix = "")
{
indent--;
*this << "}\n";
*this << "}" << block_suffix << "\n";
}
private:
......
......@@ -103,10 +103,10 @@ namespace ngraph
{
namespace gpu
{
void GPU_Emitter::EmitElementwise(
void GPU_Emitter::emit_elementwise(
GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer,
const ngraph::Node* n,
const ngraph::Node* node,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
......@@ -115,12 +115,11 @@ namespace ngraph
return;
}
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::"
<< n->description() << ">(\"" << n->description() << "\""
<< node->description() << ">(\"" << node->description() << "\""
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", count"
<< ", CUdeviceptr(" << out[0].get_name() << ")";
......@@ -129,8 +128,7 @@ namespace ngraph
writer << ", CUdeviceptr(" << args[i].get_name() << ")";
}
writer << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
......@@ -140,15 +138,14 @@ namespace ngraph
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/1,
......@@ -171,8 +168,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[1].get_name() << ","
<< "&beta,"
<< "descriptor," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
......@@ -190,8 +186,8 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{
auto& first = (arg0_shape.empty() ? args[0] : args[1]);
auto& second = (arg0_shape.empty() ? args[1] : args[0]);
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "int count = " << second.get_size() << ";\n";
writer << "cublasScopy("
<< "cublas_handle,"
......@@ -201,20 +197,17 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "cublas_handle,"
<< "count ," << first.get_name() << "," << out[0].get_name()
<< ", 1);\n";
writer.indent--;
writer << "}\n";
writer.block_end();
return;
}
//set output to 0 if input size is 0
if (args[0].get_size() == 0 || args[1].get_size() == 0)
{
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * sizeof(float));\n";
writer.indent--;
writer << "}\n";
writer.block_end();
return;
}
......@@ -231,21 +224,18 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
"input1 and input2 shape does not match for dot;");
}
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "cublasSdot("
<< "cublas_handle," << args[0].get_size() << "," << args[0].get_name()
<< ","
<< "1," << args[1].get_name() << ","
<< "1," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) &&
(dot->get_reduction_axes_count() == 1))
{
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
......@@ -260,8 +250,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< out[0].get_name() << ","
<< "1);\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) &&
(dot->get_reduction_axes_count() == 1))
......@@ -273,8 +262,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{
throw std::runtime_error("input and output shape does not match for dot;");
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0.0;\n";
writer << "int m = " << arg0_shape[0] << ";\n";
......@@ -296,8 +284,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< out[0].get_name() << ","
<< "n);\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
else
{
......@@ -313,15 +300,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/1,
......@@ -344,8 +330,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[1].get_name() << ","
<< "&beta,"
<< "descriptor," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
......@@ -355,15 +340,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/1,
......@@ -386,8 +370,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[1].get_name() << ","
<< "&beta,"
<< "descriptor," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
......@@ -397,15 +380,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = -1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/1,
......@@ -428,8 +410,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[0].get_name() << ","
<< "&beta,"
<< "descriptor," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
......@@ -447,13 +428,9 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
//broadcast axes is empty, do a copy
if (axes.empty())
{
writer << "{ // " << node->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
<< args[0].get_name() << ", " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_begin(" // " + node->get_name());
kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer.block_end();
return;
}
......@@ -487,8 +464,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
repeat_size *= result_shape[i];
}
writer << "{ // " << node->get_name() << " \n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "runtime::gpu::emit_broadcast(\"" << node->description()
<< "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
<< out[0].get_name() << ")"
......@@ -496,9 +472,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "\"}"
<< ", " << repeat_size << ", " << repeat_times << ", "
<< out[0].get_size() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
else
{
......@@ -519,8 +493,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
return;
}
auto reshape = static_cast<const op::Reshape*>(node);
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto result_shape = out[0].get_shape();
......@@ -536,19 +509,12 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
// we can just copy.
if (same_layout || result_shape_product < 2)
{
writer << "{ // " << node->get_name() << " 1\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
<< args[0].get_name() << ", " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
// If there *is* a layout change in the 2D case, we transpose the input.
else if (arg_rank == 2)
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
......@@ -563,8 +529,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< args[0].get_name() << "," << arg_shape[1] << "," << out[0].get_name()
<< "," << result_shape[1] << ");\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
writer.indent--;
writer << "}\n";
}
// Other cases (reordering of axes for tensors with rank>2) are not handled yet.
else
......@@ -573,8 +537,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
"Axis permutation in reshape is not implemented yet for tensors with "
"rank>2");
}
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
......@@ -589,15 +552,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 1.0, beta = 0;
cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/1,
......@@ -620,8 +582,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[1].get_name() << ","
<< "&beta,"
<< "descriptor," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
......@@ -642,8 +603,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
repeat_size *= result_shape[i];
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer << "runtime::gpu::emit_onehot(\"" << node->description()
......@@ -652,8 +612,7 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", " << repeat_size << ", " << repeat_times << ", " << args[0].get_size()
<< ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
......@@ -663,15 +622,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer.block_begin(" // " + node->get_name());
writer << "int count = " << out[0].get_size() << ";\n";
writer += R"(
float alpha1 = 1.0, alpha2 = 0, beta = 0;
cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/1,
......@@ -694,20 +652,128 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "descriptor," << args[0].get_name() << ","
<< "&beta,"
<< "descriptor," << out[0].get_name() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_end();
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Result)
{
writer << "{ //" << node->get_name() << "\n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", "
<< args[0].get_name() << ", " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
writer.block_begin(" // " + node->get_name());
kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer.block_end();
return;
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Sum)
{
auto sum_node = static_cast<const ngraph::op::Sum*>(node);
auto reduction_axes = sum_node->get_reduction_axes();
auto& input_shape = args[0].get_shape();
const std::string input_desc = "input_descriptor";
const std::string output_desc = "output_descriptor";
const std::string tensor_type = "CUDNN_DATA_FLOAT";
const std::string tensor_format = "CUDNN_TENSOR_NCHW";
writer.block_begin(" // " + node->get_name());
{
if (out[0].get_size() != 0)
{
// one of args[] axes has zero size, zero output
if (args[0].get_size() == 0)
{
kernel::emit_memset(writer, out[0], 0);
}
// no change in dimensions, reduction not necessary
else if (input_shape.size() == out[0].get_shape().size())
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
// descriptors for tensors with <= 4 dimensions
else if (input_shape.size() <= 4)
{
// construct input tensor descriptor rt impl.
std::array<size_t, 4> dimensions;
size_t pos = 0;
for (size_t i = input_shape.size(); i < 4; i++)
{
dimensions[pos++] = 1;
}
for (size_t i = 0; i < input_shape.size(); i++)
{
dimensions[pos++] = input_shape[i];
}
kernel::emit_cudnnTensor4dDescriptor(
writer, input_desc, tensor_format, tensor_type, dimensions);
// mark reduced axes of input tensor for output tensor descriptor
for (auto const& idx_dim : reduction_axes)
{
dimensions[(4 - input_shape.size()) + idx_dim] = 1;
}
kernel::emit_cudnnTensor4dDescriptor(
writer, output_desc, tensor_format, tensor_type, dimensions);
// emit sum reduce operation
kernel::emit_cudnnReduceTensor(writer,
args[0],
out[0],
"CUDNN_REDUCE_TENSOR_ADD",
tensor_type,
"CUDNN_NOT_PROPAGATE_NAN",
input_desc,
output_desc,
1.0,
0.0);
}
// descriptors for Nd tensors
else
{
std::vector<size_t> dimensions = input_shape;
auto compute_strides = [](const std::vector<size_t>& dim) {
std::vector<size_t> strides(dim.size(), 1);
std::copy(dim.begin() + 1, dim.end(), strides.begin());
for (int64_t i = dim.size() - 2; i >= 0; i--)
{
strides[i] *= strides[i + 1];
}
return strides;
};
kernel::emit_cudnnTensorNdDescriptor(writer,
input_desc,
tensor_type,
dimensions.size(),
dimensions,
compute_strides(dimensions));
// mark reduced axes of input tensor for output tensor descriptor
for (auto const& idx_dim : reduction_axes)
{
dimensions[idx_dim] = 1;
}
kernel::emit_cudnnTensorNdDescriptor(writer,
output_desc,
tensor_type,
dimensions.size(),
dimensions,
compute_strides(dimensions));
// emit sum reduce operation
kernel::emit_cudnnReduceTensor(writer,
args[0],
out[0],
"CUDNN_REDUCE_TENSOR_ADD",
tensor_type,
"CUDNN_NOT_PROPAGATE_NAN",
input_desc,
output_desc,
1.0,
0.0);
}
}
}
writer.block_end();
return;
}
}
......
......@@ -58,7 +58,7 @@ namespace ngraph
{
}
static void EmitElementwise(GPU_ExternalFunction* external_function,
static void emit_elementwise(GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer,
const ngraph::Node* node,
const std::vector<GPU_TensorViewWrapper>& args,
......
......@@ -165,54 +165,54 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Dot>},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Multiply>},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::nop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::GetOutputElement),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>},
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reduce>},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Slice>},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sum>},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::OneHot>},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sqrt>},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBackpropData),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>},
......@@ -231,8 +231,8 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Product), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Product>},
{TI(ngraph::op::Max), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Max>},
{TI(ngraph::op::Min), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Min>},
{TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Softmax), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Softmax>},
};
......
......@@ -13,11 +13,113 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <map>
#include "gpu_kernel_emitters.hpp"
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/util.hpp"
using namespace ngraph;
using namespace ngraph::runtime::gpu::kernel;
void runtime::gpu::kernel::emit_memset(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
int value,
size_t buffer_size)
{
if (buffer_size == 0)
{
buffer_size = dst.get_size() * dst.get_element_type().size();
}
writer << "runtime::gpu::cuda_memset(" << dst.get_name() << ", " << value << ", " << buffer_size
<< ");\n";
}
void runtime::gpu::kernel::emit_memcpyDtD(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
const GPU_TensorViewWrapper& src)
{
writer << "runtime::gpu::cuda_memcpyDtD(" << dst.get_name() << ", " << src.get_name() << ", "
<< dst.get_size() << " * " << dst.get_element_type().size() << ");\n";
return;
}
void runtime::gpu::kernel::emit_cudnnTensor4dDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& format,
const std::string& data_type,
const std::array<size_t, 4>& axes)
{
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "cudnnCreateTensorDescriptor(&" << name << ");\n";
writer << "cudnnSetTensor4dDescriptor(" << name << ",\n";
writer << " /*format=*/" << format << ",\n";
writer << " /*dataType=*/" << data_type;
for (auto const& axis : axes)
{
writer << ",\n /*dimension_size*/" << axis;
}
writer << ");\n";
}
void runtime::gpu::kernel::emit_cudnnTensorNdDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const size_t& num_axes,
const std::vector<size_t>& axes,
const std::vector<size_t>& strides)
{
writer << "const int " << name << "_axes[] = {" << join(axes) << "};\n";
writer << "const int " << name << "_strides[] = {" << join(strides) << "};\n";
writer << "cudnnTensorDescriptor_t " << name << ";\n";
writer << "cudnnCreateTensorDescriptor(&" << name << ");\n";
writer << "cudnnSetTensorNdDescriptor(" << name << ",\n";
writer << " /*dataType=*/" << data_type << ",\n";
writer << " /*num_dimensions=*/" << num_axes << ",\n";
writer << " /*dimensions*/" << name << "_axes,\n";
writer << " /*strides*/" << name << "_strides);\n";
}
void runtime::gpu::kernel::emit_cudnnReduceTensor(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& in,
const GPU_TensorViewWrapper& out,
const std::string& reduce_op,
const std::string& data_type,
const std::string& nan_prop,
const std::string& input_desc,
const std::string& output_desc,
const float& alpha,
const float& beta)
{
writer << "cudnnReduceTensorDescriptor_t reduceTensorDesc;\n";
writer << "cudnnCreateReduceTensorDescriptor(&reduceTensorDesc);\n";
writer << "cudnnSetReduceTensorDescriptor(reduceTensorDesc,\n";
writer << " " << reduce_op << ",\n";
writer << " " << data_type << ",\n";
writer << " " << nan_prop << ",\n";
writer << " CUDNN_REDUCE_TENSOR_NO_INDICES,\n";
writer << " CUDNN_32BIT_INDICES);\n";
writer << "size_t workspace_size = 0;\n";
writer << "cudnnGetReductionWorkspaceSize(cudnn_handle,\n";
writer << " reduceTensorDesc,\n";
writer << " " << input_desc << ",\n";
writer << " " << output_desc << ",\n";
writer << " &workspace_size);\n";
writer << "void* workspace_ptr = "
"ngraph::runtime::gpu::create_gpu_buffer(workspace_size);\n";
writer << "float alpha = " << alpha << ", beta = " << beta << ";\n";
writer << "cudnnReduceTensor(cudnn_handle,\n";
writer << " reduceTensorDesc,\n";
writer << " nullptr,\n";
writer << " 0,\n";
writer << " workspace_ptr,\n";
writer << " workspace_size,\n";
writer << " &alpha,\n";
writer << " " << input_desc << ",\n";
writer << " " << in.get_name() << ",\n";
writer << " &beta,\n";
writer << " " << output_desc << ",\n";
writer << " " << out.get_name() << ");\n";
writer << "ngraph::runtime::gpu::free_gpu_buffer(workspace_ptr);\n";
}
......@@ -17,6 +17,8 @@
#pragma once
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/node.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
namespace ngraph
{
......@@ -26,6 +28,38 @@ namespace ngraph
{
namespace kernel
{
void emit_memset(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
int value,
size_t buffer_size = 0);
void emit_memcpyDtD(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
const GPU_TensorViewWrapper& src);
void emit_cudnnTensor4dDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& format,
const std::string& data_type,
const std::array<size_t, 4>& axes);
void emit_cudnnTensorNdDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const size_t& num_axes,
const std::vector<size_t>& axes,
const std::vector<size_t>& strides);
void emit_cudnnReduceTensor(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& in,
const GPU_TensorViewWrapper& out,
const std::string& reduce_op,
const std::string& data_type,
const std::string& nan_prop,
const std::string& input_desc,
const std::string& output_desc,
const float& alpha,
const float& beta);
}
}
}
......
......@@ -3163,7 +3163,6 @@ TEST(${BACKEND_NAME}, tensor_constant_int64)
// Trivial case with no summed axes.
TEST(${BACKEND_NAME}, sum_trivial)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A});
......@@ -3185,7 +3184,6 @@ TEST(${BACKEND_NAME}, sum_trivial)
// Failure has been reported at 5D for some reason
TEST(${BACKEND_NAME}, sum_trivial_5d)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A});
......@@ -3209,7 +3207,6 @@ TEST(${BACKEND_NAME}, sum_trivial_5d)
TEST(${BACKEND_NAME}, sum_to_scalar)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0, 1}), op::ParameterVector{A});
......@@ -3234,7 +3231,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar)
TEST(${BACKEND_NAME}, sum_matrix_columns)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{2};
......@@ -3260,7 +3256,6 @@ TEST(${BACKEND_NAME}, sum_matrix_columns)
TEST(${BACKEND_NAME}, sum_matrix_rows)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3};
......@@ -3286,7 +3281,6 @@ TEST(${BACKEND_NAME}, sum_matrix_rows)
TEST(${BACKEND_NAME}, sum_matrix_rows_zero)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 0};
......@@ -3315,9 +3309,7 @@ TEST(${BACKEND_NAME}, sum_matrix_rows_zero)
TEST(${BACKEND_NAME}, sum_matrix_cols_zero)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
// Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})).
Shape shape_a{0, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
......@@ -3345,7 +3337,6 @@ TEST(${BACKEND_NAME}, sum_matrix_cols_zero)
TEST(${BACKEND_NAME}, sum_vector_zero)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{0};
......@@ -3374,7 +3365,6 @@ TEST(${BACKEND_NAME}, sum_vector_zero)
TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{0, 0};
......@@ -3403,7 +3393,6 @@ TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero)
TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3, 3};
......@@ -3435,7 +3424,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig)
TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3, 3};
......@@ -3467,7 +3455,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig)
TEST(${BACKEND_NAME}, sum_3d_to_vector)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3};
......@@ -3493,7 +3480,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector)
TEST(${BACKEND_NAME}, sum_3d_to_scalar)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{};
......@@ -3519,7 +3505,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_scalar)
TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 0, 2};
......@@ -3546,7 +3531,6 @@ TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim)
TEST(${BACKEND_NAME}, sum_to_scalar_stable)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape{2, 2};
......@@ -3570,7 +3554,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar_stable)
TEST(${BACKEND_NAME}, sum_3d_to_vector_stable)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3};
......@@ -3594,6 +3577,28 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector_stable)
test::all_close(read_vector<float>(result), vector<float>{1e-4f, 1e-5f, 1e-6f}, 5e-2f));
}
TEST(${BACKEND_NAME}, sum_5d_to_scalar)
{
Shape shape_a{3, 3, 3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{};
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0, 1, 2, 3, 4}),
op::ParameterVector{A});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
// Create some tensors for input/output
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
copy_data(a, std::vector<float>(std::pow(3, 5), 1));
auto result = backend->make_primary_tensor_view(element::f32, shape_rt);
cf->call({result}, {a});
EXPECT_EQ(std::vector<float>{243.}, read_vector<float>(result));
}
TEST(${BACKEND_NAME}, sign)
{
Shape shape{2, 3};
......