add gpu broadcast

e1b2f54c · fenglei.tian · f4ff1c3b · e1b2f54c · e1b2f54c · e1b2f54c
Commit e1b2f54c authored Mar 02, 2018 by fenglei.tian
5 changed files
--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
@@ -61,6 +61,12 @@ namespace ngraph
                                              0));  // arguments
                CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
            }
+
+            void emit_broadcast(
+                void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
+            {
+                return;
+            }
        }
    }
 }
--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp
@@ -27,6 +27,8 @@ namespace ngraph
        namespace gpu
        {
            void emit_abs(void* in, void* out, size_t count);
+            void emit_broadcast(
+                void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
        }
    }
 }
--- a/src/ngraph/runtime/gpu/gpu_emitter.cpp
+++ b/src/ngraph/runtime/gpu/gpu_emitter.cpp
@@ -457,7 +457,62 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
+    auto broadcast = static_cast<const ngraph::op::Broadcast*>(n);
+    auto arg_shape = args[0].get_shape();
+    auto result_shape = out[0].get_shape();
+
+    auto& axes = broadcast->get_broadcast_axes();
+    if (axes.empty())
+    {
+        writer << "{   // " << n->get_name() << " \n";
+        writer.indent++;
+        writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
+               << ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
+        writer.indent--;
+        writer << "}\n";
+        return;
+    }
+
+    vector<int> axes_v;
+    std::copy(axes.begin(), axes.end(), std::back_inserter(axes_v));
+    bool is_one_axes = true;
+    if (axes.size() != 1)
+    {
+        for (int i = 1; i < axes_v.size(); i++)
+        {
+            if (axes_v[i] != axes_v[i - 1] + 1)
+            {
+                is_one_axes = false;
+                break;
+            }
+        }
+    }
+    if (is_one_axes)
+    {
+        int repeat_times = 1;
+        for (int i = 0; i < axes.size(); i++)
+        {
+            repeat_times *= result_shape[axes_v[i]];
+        }
+
+        int repeat_size = 1;
+        for (int i = *axes.rbegin(); i < result_shape.size(); i++)
+        {
+            repeat_size *= result_shape[i];
+        }
+
+        writer << "{   // " << n->get_name() << " \n";
+        writer.indent++;
+        writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", " << out[0].get_name()
+               << ", " << repeat_size << ", " << repeat_times << ", " << out[0].get_size()
+               << ");\n";
+        writer.indent--;
+        writer << "}\n";
+    }
+    else
+    {
        throw std::runtime_error(n->get_name() + " is not implemented.");
+    }
 }

 void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
@@ -474,7 +529,6 @@ void runtime::gpu::GPU_Emitter::EmitConstant(
    const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
    const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
 {
-    throw std::runtime_error(n->get_name() + " is not implemented.");
 }

 void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,

--- a/test/backend_performance.cpp
+++ b/test/backend_performance.cpp
@@ -46,7 +46,6 @@ TEST(benchmark, mxnet_mnist_mlp_forward)

 TEST(benchmark, gpu_mxnet_mnist_mlp_forward)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json");
    run_benchmark(json_path, "GPU", 1000);
 }

--- a/test/backend_test.in.cpp
+++ b/test/backend_test.in.cpp
@@ -1620,7 +1620,6 @@ TEST(${BACKEND_NAME}, function_call)

 TEST(${BACKEND_NAME}, broadcast_scalar_vector)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{4};
@@ -1643,7 +1642,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_vector)

 TEST(${BACKEND_NAME}, broadcast_scalar_matrix)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{2, 2};
@@ -1666,7 +1664,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_matrix)

 TEST(${BACKEND_NAME}, broadcast_scalar_tensor)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{2, 2, 2};
@@ -1689,7 +1686,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_tensor)

 TEST(${BACKEND_NAME}, broadcast_trivial)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape{2, 2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto f = make_shared<Function>(make_shared<op::Broadcast>(A, shape, AxisSet{}),
@@ -1711,7 +1707,6 @@ TEST(${BACKEND_NAME}, broadcast_trivial)

 TEST(${BACKEND_NAME}, broadcast_vector_colwise)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{3};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{3, 4};
@@ -1734,7 +1729,6 @@ TEST(${BACKEND_NAME}, broadcast_vector_colwise)

 TEST(${BACKEND_NAME}, broadcast_vector_rowwise)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{4};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{3, 4};
@@ -1806,7 +1800,6 @@ TEST(${BACKEND_NAME}, broadcast_vector_rowwise_int64)

 TEST(${BACKEND_NAME}, broadcast_matrix_0)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{2, 2, 2};
@@ -1829,7 +1822,6 @@ TEST(${BACKEND_NAME}, broadcast_matrix_0)

 TEST(${BACKEND_NAME}, broadcast_matrix_1)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{2, 2, 2};
@@ -1852,7 +1844,6 @@ TEST(${BACKEND_NAME}, broadcast_matrix_1)

 TEST(${BACKEND_NAME}, broadcast_matrix_2)
 {
-    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    Shape shape_a{2, 2};
    auto A = make_shared<op::Parameter>(element::f32, shape_a);
    Shape shape_r{2, 2, 2};