Commit 41268068 authored by Fenglei's avatar Fenglei Committed by Robert Kimball

gpu broadcast (#576)

* add gpu broadcast

* add broadcast kernel

* fix bug for cuda_memcpyDtD usage in gpu_external_function.cpp
parent ae50019e
......@@ -61,6 +61,55 @@ namespace ngraph
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{
std::string name = "broadcast";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
std::string data_type("float");
kernel = R"(
extern "C" __global__
void cuda_)" + name + "(" + data_type +
"* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" + R"(
{
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
size_t idx = tid / (m * k) * m + tid % m;
out[tid] = in[idx];
}
})";
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
}
}
......@@ -27,6 +27,8 @@ namespace ngraph
namespace gpu
{
void emit_abs(void* in, void* out, size_t count);
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
}
}
}
......@@ -457,7 +457,65 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
// Emits GPU code for the Broadcast op: copies args[0] into out[0],
// replicating it along the op's broadcast axes.
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
// NOTE(review): this throw looks like the *removed* side of the diff (the
// scrape lost the +/- markers); if kept, everything below is unreachable.
// Confirm it is deleted in the applied change.
throw std::runtime_error(n->get_name() + " is not implemented.");
auto broadcast = static_cast<const ngraph::op::Broadcast*>(n);
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
auto& axes = broadcast->get_broadcast_axes();
// No broadcast axes: output is an element-for-element copy of the input,
// so emit a plain device-to-device memcpy.
if (axes.empty())
{
writer << "{ // " << n->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
<< ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
return;
}
// The kernel only handles a single broadcast axis, or a run of consecutive
// axes (e.g. {1}, {0,1}, {1,2,3}) that can be collapsed into one.
vector<int> axes_v;
std::copy(axes.begin(), axes.end(), std::back_inserter(axes_v));
std::sort(axes_v.begin(), axes_v.end());
bool is_one_axes = true;
if (axes.size() != 1)
{
// Consecutive iff each sorted axis is exactly its predecessor plus one.
// NOTE(review): `int i` vs `axes_v.size()` is a signed/unsigned compare;
// harmless here but worth switching to size_t.
for (int i = 1; i < axes_v.size(); i++)
{
if (axes_v[i] != axes_v[i - 1] + 1)
{
is_one_axes = false;
break;
}
}
}
if (is_one_axes)
{
// repeat_times: product of the result extents along the broadcast axes —
// how many copies of each input block are emitted.
// NOTE(review): `int` may overflow for very large shapes; size_t is safer.
int repeat_times = 1;
for (int i = 0; i < axes_v.size(); i++)
{
repeat_times *= result_shape[axes_v[i]];
}
// repeat_size: product of the result extents *after* the last broadcast
// axis — the contiguous block length that gets replicated.
int repeat_size = 1;
for (int i = *axes_v.rbegin() + 1; i < result_shape.size(); i++)
{
repeat_size *= result_shape[i];
}
writer << "{ // " << n->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", " << out[0].get_name()
<< ", " << repeat_size << ", " << repeat_times << ", " << out[0].get_size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
else
{
// Non-consecutive broadcast axes are not supported by the kernel yet.
throw std::runtime_error(n->get_name() + " is not implemented.");
}
}
void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
......@@ -474,7 +532,6 @@ void runtime::gpu::GPU_Emitter::EmitConstant(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
// NOTE(review): this throw appears to be the *removed* side of the diff —
// after the change EmitConstant becomes a deliberate no-op, presumably
// because constants are materialized elsewhere. Confirm against the
// applied patch.
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
......
......@@ -46,7 +46,6 @@ TEST(benchmark, mxnet_mnist_mlp_forward)
TEST(benchmark, gpu_mxnet_mnist_mlp_forward)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json");
run_benchmark(json_path, "GPU", 1000);
}
......
......@@ -1701,7 +1701,6 @@ TEST(${BACKEND_NAME}, function_call)
TEST(${BACKEND_NAME}, broadcast_scalar_vector)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{4};
......@@ -1724,7 +1723,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_vector)
TEST(${BACKEND_NAME}, broadcast_scalar_matrix)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2};
......@@ -1747,7 +1745,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_matrix)
TEST(${BACKEND_NAME}, broadcast_scalar_tensor)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2, 2};
......@@ -1770,7 +1767,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_tensor)
TEST(${BACKEND_NAME}, broadcast_trivial)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Broadcast>(A, shape, AxisSet{}),
......@@ -1792,7 +1788,6 @@ TEST(${BACKEND_NAME}, broadcast_trivial)
TEST(${BACKEND_NAME}, broadcast_vector_colwise)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{3, 4};
......@@ -1815,7 +1810,6 @@ TEST(${BACKEND_NAME}, broadcast_vector_colwise)
TEST(${BACKEND_NAME}, broadcast_vector_rowwise)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{4};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{3, 4};
......@@ -1887,7 +1881,6 @@ TEST(${BACKEND_NAME}, broadcast_vector_rowwise_int64)
TEST(${BACKEND_NAME}, broadcast_matrix_0)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2, 2};
......@@ -1910,7 +1903,6 @@ TEST(${BACKEND_NAME}, broadcast_matrix_0)
TEST(${BACKEND_NAME}, broadcast_matrix_1)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2, 2};
......@@ -1933,7 +1925,6 @@ TEST(${BACKEND_NAME}, broadcast_matrix_1)
TEST(${BACKEND_NAME}, broadcast_matrix_2)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2, 2};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.