Commit bae77590 authored by Chris Sullivan, committed by adstraw

Add reduce sum to the GPU transformer (op::Sum) (#671)

* Current cudnn implementations use only
a single dimension for the ngraph tensor data (width).
In this case the tensor format should be set to

CUDNN_TENSOR_NCHW

so that adjacent memory accesses are coalesced (stride=1 for width).

* Added some kernel emitter helpers that are reused often.
* Renamed EmitElementwise -> emit_elementwise to match emit<T>.
* op::Sum now handles trivial case of dim(input_tensor) = dim(output_tensor)
  by performing a memcpy as no axes are reduced.

* Added general case for Nd descriptors, which is used when the tensor
  has more than 4 dimensions. Currently a naive reduce is performed;
  in the future a coordinate transformation could be performed to
  improve the memory layout for the reduction.

* Switched to codegen::CodeWriter::block_begin/end.
It appears that CodeWriter::block_begin/end is not frequently used for emitters (in cpu and gpu transformers)
because a block comment is often desired. To this end I added prefix/suffix default parameters to CodeWriter::block_begin/end
so that this functionality is captured.
parent 72f4d661
...@@ -68,16 +68,16 @@ public: ...@@ -68,16 +68,16 @@ public:
std::string generate_temporary_name(std::string prefix = "tempvar"); std::string generate_temporary_name(std::string prefix = "tempvar");
void block_begin() void block_begin(std::string block_prefix = "")
{ {
*this << "{\n"; *this << "{" << block_prefix << "\n";
indent++; indent++;
} }
void block_end() void block_end(std::string block_suffix = "")
{ {
indent--; indent--;
*this << "}\n"; *this << "}" << block_suffix << "\n";
} }
private: private:
......
This diff is collapsed.
...@@ -58,7 +58,7 @@ namespace ngraph ...@@ -58,7 +58,7 @@ namespace ngraph
{ {
} }
static void EmitElementwise(GPU_ExternalFunction* external_function, static void emit_elementwise(GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const ngraph::Node* node, const ngraph::Node* node,
const std::vector<GPU_TensorViewWrapper>& args, const std::vector<GPU_TensorViewWrapper>& args,
......
...@@ -165,54 +165,54 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -165,54 +165,54 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Dot>}, {TI(ngraph::op::Dot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Dot>},
{TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Multiply>}, {TI(ngraph::op::Multiply), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Multiply>},
{TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::nop}, {TI(ngraph::op::Parameter), &runtime::gpu::GPU_Emitter::nop},
{TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Abs), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>}, {TI(ngraph::op::Concat), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Concat>},
{TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Divide), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Equal), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::GetOutputElement), {TI(ngraph::op::GetOutputElement),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>}, &runtime::gpu::GPU_Emitter::emit<ngraph::op::GetOutputElement>},
{TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Greater), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::GreaterEq), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Less), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::LessEq), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Log), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>}, {TI(ngraph::op::Maximum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Maximum>},
{TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>}, {TI(ngraph::op::Minimum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Minimum>},
{TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>}, {TI(ngraph::op::Negative), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::NotEqual), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Power), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>}, {TI(ngraph::op::Select), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Select>},
{TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Subtract), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>}, {TI(ngraph::op::Broadcast), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Broadcast>},
{TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Convert), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>}, {TI(ngraph::op::Constant), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Constant>},
{TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>}, {TI(ngraph::op::Reshape), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reshape>},
{TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>}, {TI(ngraph::op::FunctionCall), &runtime::gpu::GPU_Emitter::emit<ngraph::op::FunctionCall>},
{TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reduce>}, {TI(ngraph::op::Reduce), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reduce>},
{TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Sign), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Slice>}, {TI(ngraph::op::Slice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Slice>},
{TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sum>}, {TI(ngraph::op::Sum), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sum>},
{TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Exp), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Sin), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Sinh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Cos), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Cosh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Tan), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Tanh), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Asin), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Acos), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Atan), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReplaceSlice>}, {TI(ngraph::op::ReplaceSlice), &runtime::gpu::GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
{TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::OneHot>}, {TI(ngraph::op::OneHot), &runtime::gpu::GPU_Emitter::emit<ngraph::op::OneHot>},
{TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Floor), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Ceiling), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sqrt>}, {TI(ngraph::op::Sqrt), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Sqrt>},
{TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convolution>}, {TI(ngraph::op::Convolution), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropFilters), {TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>}, &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBackpropData), {TI(ngraph::op::ConvolutionBackpropData),
&runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>}, &runtime::gpu::GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Not), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>}, {TI(ngraph::op::MaxPool), &runtime::gpu::GPU_Emitter::emit<ngraph::op::MaxPool>},
{TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>}, {TI(ngraph::op::Reverse), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Reverse>},
{TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>}, {TI(ngraph::op::Result), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Result>},
...@@ -231,8 +231,8 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -231,8 +231,8 @@ static const runtime::gpu::OpMap dispatcher{
{TI(ngraph::op::Product), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Product>}, {TI(ngraph::op::Product), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Product>},
{TI(ngraph::op::Max), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Max>}, {TI(ngraph::op::Max), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Max>},
{TI(ngraph::op::Min), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Min>}, {TI(ngraph::op::Min), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Min>},
{TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::Relu), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::EmitElementwise}, {TI(ngraph::op::ReluBackprop), &runtime::gpu::GPU_Emitter::emit_elementwise},
{TI(ngraph::op::Softmax), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Softmax>}, {TI(ngraph::op::Softmax), &runtime::gpu::GPU_Emitter::emit<ngraph::op::Softmax>},
}; };
......
...@@ -13,11 +13,113 @@ ...@@ -13,11 +13,113 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*******************************************************************************/ *******************************************************************************/
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include "gpu_kernel_emitters.hpp"
#include "ngraph/codegen/code_writer.hpp" #include "ngraph/codegen/code_writer.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp" #include "ngraph/util.hpp"
using namespace ngraph; using namespace ngraph;
using namespace ngraph::runtime::gpu::kernel;
// Writes a `runtime::gpu::cuda_memset(<dst>, <value>, <bytes>);` statement to `writer`.
// A `buffer_size` of 0 selects the size of `dst` itself: its element count
// multiplied by the size of its element type.
void runtime::gpu::kernel::emit_memset(codegen::CodeWriter& writer,
                                       const GPU_TensorViewWrapper& dst,
                                       int value,
                                       size_t buffer_size)
{
    const size_t byte_count =
        (buffer_size != 0) ? buffer_size : dst.get_size() * dst.get_element_type().size();

    writer << "runtime::gpu::cuda_memset(" << dst.get_name();
    writer << ", " << value;
    writer << ", " << byte_count << ");\n";
}
// Writes a `runtime::gpu::cuda_memcpyDtD(<dst>, <src>, <count> * <elem size>);`
// statement to `writer`. The byte count is taken from `dst` only and is emitted
// as an unevaluated product expression.
void runtime::gpu::kernel::emit_memcpyDtD(codegen::CodeWriter& writer,
                                          const GPU_TensorViewWrapper& dst,
                                          const GPU_TensorViewWrapper& src)
{
    // Pointer arguments first ...
    writer << "runtime::gpu::cuda_memcpyDtD(" << dst.get_name() << ", " << src.get_name() << ", ";
    // ... followed by the size expression and the closing of the call.
    writer << dst.get_size() << " * " << dst.get_element_type().size() << ");\n";
}
// Writes source text that declares and creates a cudnnTensorDescriptor_t called
// `name`, then configures it through cudnnSetTensor4dDescriptor using `format`,
// `data_type` and the four extents in `axes` (passed in array order).
void runtime::gpu::kernel::emit_cudnnTensor4dDescriptor(codegen::CodeWriter& writer,
                                                        const std::string& name,
                                                        const std::string& format,
                                                        const std::string& data_type,
                                                        const std::array<size_t, 4>& axes)
{
    // Declaration and creation of the descriptor object.
    writer << "cudnnTensorDescriptor_t " << name << ";\n"
           << "cudnnCreateTensorDescriptor(&" << name << ");\n";

    // Leading arguments of the setter call.
    writer << "cudnnSetTensor4dDescriptor(" << name << ",\n"
           << " /*format=*/" << format << ",\n"
           << " /*dataType=*/" << data_type;

    // One trailing argument per extent; each is preceded by its own separator
    // so the argument list needs no special handling for the last entry.
    for (size_t i = 0; i < axes.size(); ++i)
    {
        writer << ",\n /*dimension_size*/" << axes[i];
    }
    writer << ");\n";
}
// Writes source text that builds an N-dimensional cuDNN tensor descriptor
// called `name`. Two helper arrays, `<name>_axes` and `<name>_strides`, are
// emitted first from `axes` and `strides`; they are then handed to
// cudnnSetTensorNdDescriptor together with `data_type` and `num_axes`.
void runtime::gpu::kernel::emit_cudnnTensorNdDescriptor(codegen::CodeWriter& writer,
                                                        const std::string& name,
                                                        const std::string& data_type,
                                                        const size_t& num_axes,
                                                        const std::vector<size_t>& axes,
                                                        const std::vector<size_t>& strides)
{
    // Extent and stride tables referenced by the setter call below.
    writer << "const int " << name << "_axes[] = {" << join(axes) << "};\n"
           << "const int " << name << "_strides[] = {" << join(strides) << "};\n";

    // Descriptor declaration and creation.
    writer << "cudnnTensorDescriptor_t " << name << ";\n"
           << "cudnnCreateTensorDescriptor(&" << name << ");\n";

    // Descriptor configuration.
    writer << "cudnnSetTensorNdDescriptor(" << name << ",\n"
           << " /*dataType=*/" << data_type << ",\n"
           << " /*num_dimensions=*/" << num_axes << ",\n"
           << " /*dimensions*/" << name << "_axes,\n"
           << " /*strides*/" << name << "_strides);\n";
}
// Writes source text that performs a cuDNN tensor reduction from `in` to `out`.
//
// The emitted snippet:
//   1. creates and configures a cudnnReduceTensorDescriptor_t from `reduce_op`,
//      `data_type` and `nan_prop` (no indices are requested),
//   2. queries the workspace size for `input_desc` / `output_desc` and allocates
//      it with ngraph::runtime::gpu::create_gpu_buffer,
//   3. calls cudnnReduceTensor with the scaling factors `alpha` and `beta`,
//   4. frees the workspace and destroys the reduce descriptor.
//
// `input_desc` and `output_desc` are the names of tensor descriptors that must
// already exist in the generated code; `cudnn_handle` must be in scope there too.
//
// NOTE: the snippet declares the fixed local names `reduceTensorDesc`,
// `workspace_size`, `workspace_ptr`, `alpha` and `beta`, so emitting it twice in
// the same generated scope would redeclare them; wrap each use in its own block.
void runtime::gpu::kernel::emit_cudnnReduceTensor(codegen::CodeWriter& writer,
                                                  const GPU_TensorViewWrapper& in,
                                                  const GPU_TensorViewWrapper& out,
                                                  const std::string& reduce_op,
                                                  const std::string& data_type,
                                                  const std::string& nan_prop,
                                                  const std::string& input_desc,
                                                  const std::string& output_desc,
                                                  const float& alpha,
                                                  const float& beta)
{
    // Reduce descriptor: operation, compute type and NaN propagation policy.
    writer << "cudnnReduceTensorDescriptor_t reduceTensorDesc;\n";
    writer << "cudnnCreateReduceTensorDescriptor(&reduceTensorDesc);\n";
    writer << "cudnnSetReduceTensorDescriptor(reduceTensorDesc,\n";
    writer << " " << reduce_op << ",\n";
    writer << " " << data_type << ",\n";
    writer << " " << nan_prop << ",\n";
    writer << " CUDNN_REDUCE_TENSOR_NO_INDICES,\n";
    writer << " CUDNN_32BIT_INDICES);\n";

    // Scratch space required by the reduction.
    writer << "size_t workspace_size = 0;\n";
    writer << "cudnnGetReductionWorkspaceSize(cudnn_handle,\n";
    writer << " reduceTensorDesc,\n";
    writer << " " << input_desc << ",\n";
    writer << " " << output_desc << ",\n";
    writer << " &workspace_size);\n";
    writer << "void* workspace_ptr = "
              "ngraph::runtime::gpu::create_gpu_buffer(workspace_size);\n";

    // The reduction itself; the indices buffer is unused (nullptr / 0).
    writer << "float alpha = " << alpha << ", beta = " << beta << ";\n";
    writer << "cudnnReduceTensor(cudnn_handle,\n";
    writer << " reduceTensorDesc,\n";
    writer << " nullptr,\n";
    writer << " 0,\n";
    writer << " workspace_ptr,\n";
    writer << " workspace_size,\n";
    writer << " &alpha,\n";
    writer << " " << input_desc << ",\n";
    writer << " " << in.get_name() << ",\n";
    writer << " &beta,\n";
    writer << " " << output_desc << ",\n";
    writer << " " << out.get_name() << ");\n";

    // Release everything this snippet created. The reduce descriptor is local to
    // the snippet, so it must be destroyed here or it leaks on every execution.
    writer << "ngraph::runtime::gpu::free_gpu_buffer(workspace_ptr);\n";
    writer << "cudnnDestroyReduceTensorDescriptor(reduceTensorDesc);\n";
}
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#pragma once #pragma once
#include "ngraph/codegen/code_writer.hpp" #include "ngraph/codegen/code_writer.hpp"
#include "ngraph/node.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
namespace ngraph namespace ngraph
{ {
...@@ -26,6 +28,38 @@ namespace ngraph ...@@ -26,6 +28,38 @@ namespace ngraph
{ {
namespace kernel namespace kernel
{ {
// Emits a call to runtime::gpu::cuda_memset on `dst` with fill value `value`.
// When `buffer_size` is 0 (the default) the emitted byte count is
// dst.get_size() * dst.get_element_type().size().
void emit_memset(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
int value,
size_t buffer_size = 0);
// Emits a call to runtime::gpu::cuda_memcpyDtD from `src` to `dst`. The byte
// count is written as the expression `<dst size> * <dst element size>`.
void emit_memcpyDtD(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& dst,
const GPU_TensorViewWrapper& src);
// Emits code that declares and creates a cudnnTensorDescriptor_t called `name`
// and configures it with cudnnSetTensor4dDescriptor using `format`, `data_type`
// and the four values of `axes`, passed in array order.
void emit_cudnnTensor4dDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& format,
const std::string& data_type,
const std::array<size_t, 4>& axes);
// Emits code that declares `const int <name>_axes[]` and `<name>_strides[]`
// from `axes` and `strides`, creates a cudnnTensorDescriptor_t called `name`,
// and configures it with cudnnSetTensorNdDescriptor using `data_type` and
// `num_axes`.
void emit_cudnnTensorNdDescriptor(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const size_t& num_axes,
const std::vector<size_t>& axes,
const std::vector<size_t>& strides);
// Emits code that sets up a cudnnReduceTensorDescriptor_t from `reduce_op`,
// `data_type` and `nan_prop`, allocates the workspace reported by
// cudnnGetReductionWorkspaceSize, calls cudnnReduceTensor from `in` to `out`
// with scaling factors `alpha` / `beta`, and frees the workspace.
// `input_desc` and `output_desc` are names of tensor descriptors referenced by
// the emitted code. The snippet declares fixed local names (reduceTensorDesc,
// workspace_size, workspace_ptr, alpha, beta), so emitting it twice in the same
// generated scope would redeclare them.
void emit_cudnnReduceTensor(codegen::CodeWriter& writer,
const GPU_TensorViewWrapper& in,
const GPU_TensorViewWrapper& out,
const std::string& reduce_op,
const std::string& data_type,
const std::string& nan_prop,
const std::string& input_desc,
const std::string& output_desc,
const float& alpha,
const float& beta);
} }
} }
} }
......
...@@ -3163,7 +3163,6 @@ TEST(${BACKEND_NAME}, tensor_constant_int64) ...@@ -3163,7 +3163,6 @@ TEST(${BACKEND_NAME}, tensor_constant_int64)
// Trivial case with no summed axes. // Trivial case with no summed axes.
TEST(${BACKEND_NAME}, sum_trivial) TEST(${BACKEND_NAME}, sum_trivial)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A});
...@@ -3185,7 +3184,6 @@ TEST(${BACKEND_NAME}, sum_trivial) ...@@ -3185,7 +3184,6 @@ TEST(${BACKEND_NAME}, sum_trivial)
// Failure has been reported at 5D for some reason // Failure has been reported at 5D for some reason
TEST(${BACKEND_NAME}, sum_trivial_5d) TEST(${BACKEND_NAME}, sum_trivial_5d)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2, 2, 2}; Shape shape{2, 2, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{}), op::ParameterVector{A});
...@@ -3209,7 +3207,6 @@ TEST(${BACKEND_NAME}, sum_trivial_5d) ...@@ -3209,7 +3207,6 @@ TEST(${BACKEND_NAME}, sum_trivial_5d)
TEST(${BACKEND_NAME}, sum_to_scalar) TEST(${BACKEND_NAME}, sum_to_scalar)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape); auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0, 1}), op::ParameterVector{A}); auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0, 1}), op::ParameterVector{A});
...@@ -3234,7 +3231,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar) ...@@ -3234,7 +3231,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar)
TEST(${BACKEND_NAME}, sum_matrix_columns) TEST(${BACKEND_NAME}, sum_matrix_columns)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 2}; Shape shape_a{3, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{2}; Shape shape_rt{2};
...@@ -3260,7 +3256,6 @@ TEST(${BACKEND_NAME}, sum_matrix_columns) ...@@ -3260,7 +3256,6 @@ TEST(${BACKEND_NAME}, sum_matrix_columns)
TEST(${BACKEND_NAME}, sum_matrix_rows) TEST(${BACKEND_NAME}, sum_matrix_rows)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 2}; Shape shape_a{3, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3}; Shape shape_rt{3};
...@@ -3286,7 +3281,6 @@ TEST(${BACKEND_NAME}, sum_matrix_rows) ...@@ -3286,7 +3281,6 @@ TEST(${BACKEND_NAME}, sum_matrix_rows)
TEST(${BACKEND_NAME}, sum_matrix_rows_zero) TEST(${BACKEND_NAME}, sum_matrix_rows_zero)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 0}; Shape shape_a{3, 0};
...@@ -3315,9 +3309,7 @@ TEST(${BACKEND_NAME}, sum_matrix_rows_zero) ...@@ -3315,9 +3309,7 @@ TEST(${BACKEND_NAME}, sum_matrix_rows_zero)
TEST(${BACKEND_NAME}, sum_matrix_cols_zero) TEST(${BACKEND_NAME}, sum_matrix_cols_zero)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
// Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})). // Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})).
Shape shape_a{0, 2}; Shape shape_a{0, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
...@@ -3345,7 +3337,6 @@ TEST(${BACKEND_NAME}, sum_matrix_cols_zero) ...@@ -3345,7 +3337,6 @@ TEST(${BACKEND_NAME}, sum_matrix_cols_zero)
TEST(${BACKEND_NAME}, sum_vector_zero) TEST(${BACKEND_NAME}, sum_vector_zero)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{0}; Shape shape_a{0};
...@@ -3374,7 +3365,6 @@ TEST(${BACKEND_NAME}, sum_vector_zero) ...@@ -3374,7 +3365,6 @@ TEST(${BACKEND_NAME}, sum_vector_zero)
TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero) TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{0, 0}; Shape shape_a{0, 0};
...@@ -3403,7 +3393,6 @@ TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero) ...@@ -3403,7 +3393,6 @@ TEST(${BACKEND_NAME}, sum_matrix_to_scalar_zero_by_zero)
TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig) TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3, 3}; Shape shape_rt{3, 3};
...@@ -3435,7 +3424,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig) ...@@ -3435,7 +3424,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_most_sig)
TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig) TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3, 3}; Shape shape_rt{3, 3};
...@@ -3467,7 +3455,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig) ...@@ -3467,7 +3455,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_matrix_least_sig)
TEST(${BACKEND_NAME}, sum_3d_to_vector) TEST(${BACKEND_NAME}, sum_3d_to_vector)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{3}; Shape shape_rt{3};
...@@ -3493,7 +3480,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector) ...@@ -3493,7 +3480,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector)
TEST(${BACKEND_NAME}, sum_3d_to_scalar) TEST(${BACKEND_NAME}, sum_3d_to_scalar)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{}; Shape shape_rt{};
...@@ -3519,7 +3505,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_scalar) ...@@ -3519,7 +3505,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_scalar)
TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim) TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 0, 2}; Shape shape_a{3, 0, 2};
...@@ -3546,7 +3531,6 @@ TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim) ...@@ -3546,7 +3531,6 @@ TEST(${BACKEND_NAME}, sum_3d_eliminate_zero_dim)
TEST(${BACKEND_NAME}, sum_to_scalar_stable) TEST(${BACKEND_NAME}, sum_to_scalar_stable)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape{2, 2}; Shape shape{2, 2};
...@@ -3570,7 +3554,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar_stable) ...@@ -3570,7 +3554,6 @@ TEST(${BACKEND_NAME}, sum_to_scalar_stable)
TEST(${BACKEND_NAME}, sum_3d_to_vector_stable) TEST(${BACKEND_NAME}, sum_3d_to_vector_stable)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("NNP", "${BACKEND_NAME}"); SKIP_TEST_FOR("NNP", "${BACKEND_NAME}");
Shape shape_a{3, 3, 3}; Shape shape_a{3, 3, 3};
...@@ -3594,6 +3577,28 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector_stable) ...@@ -3594,6 +3577,28 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector_stable)
test::all_close(read_vector<float>(result), vector<float>{1e-4f, 1e-5f, 1e-6f}, 5e-2f)); test::all_close(read_vector<float>(result), vector<float>{1e-4f, 1e-5f, 1e-6f}, 5e-2f));
} }
// Sums a rank-5 tensor of ones over all five axes and expects the element
// count (3^5 = 243) as the scalar result. Per the accompanying commit notes,
// tensors with more than 4 dimensions take the general Nd-descriptor path in
// the GPU transformer, which this case exercises.
TEST(${BACKEND_NAME}, sum_5d_to_scalar)
{
    Shape shape_a{3, 3, 3, 3, 3};
    // Number of elements in shape_a, computed in integer arithmetic rather than
    // through floating-point std::pow followed by an implicit conversion.
    const size_t element_count = 3 * 3 * 3 * 3 * 3;

    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_rt{};
    auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0, 1, 2, 3, 4}),
                                   op::ParameterVector{A});

    auto manager = runtime::Manager::get("${BACKEND_NAME}");
    auto external = manager->compile(f);
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(external);

    // Create some tensors for input/output
    auto a = backend->make_primary_tensor_view(element::f32, shape_a);
    copy_data(a, std::vector<float>(element_count, 1));
    auto result = backend->make_primary_tensor_view(element::f32, shape_rt);

    cf->call({result}, {a});
    EXPECT_EQ(std::vector<float>{243.}, read_vector<float>(result));
}
TEST(${BACKEND_NAME}, sign) TEST(${BACKEND_NAME}, sign)
{ {
Shape shape{2, 3}; Shape shape{2, 3};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment