add broadcast kernel

7525d6e1 · Fenglei Tian · e1b2f54c · 7525d6e1
Commit 7525d6e1 authored Mar 02, 2018 by Fenglei Tian
Hide whitespace changes
Inline Side-by-side

Showing with 44 additions and 1 deletion

gpu_cuda_kernel_emitters.cpp src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp +44 -1

No files found.
--- a/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
+++ b/src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp
@@ -65,7 +65,50 @@ namespace ngraph
            void emit_broadcast(
                void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
            {
-                return;
+                std::string name = "broadcast";
+                // Create an instance of nvrtcProgram with the code string.
+                if (CudaFunctionPool::instance().get(name) == nullptr)
+                {
+                    const char* opts[] = {"--gpu-architecture=compute_35",
+                                          "--relocatable-device-code=true"};
+                    std::string kernel;
+                    std::string data_type("float");
+                                    kernel = R"(
+extern "C" __global__
+void cuda_)" + name + "(" +
+                         data_type + "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" + R"(
+{
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(tid < n)
+    {
+        size_t idx = tid / (m * k) * m + tid % m;
+        out[tid] = in[idx];
+    }
+})";
+                    CudaFunctionPool::instance().set(
+                        name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
+                }
+                //convert runtime ptr to driver api ptr
+                CUdeviceptr d_ptr_in, d_ptr_out;
+                d_ptr_in = (CUdeviceptr)in;
+                d_ptr_out = (CUdeviceptr)out;
+                void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};
+                CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
+                                              count,
+                                              1,
+                                              1, // grid dim
+                                              1,
+                                              1,
+                                              1, // block dim
+                                              0,
+                                              NULL, // shared mem and stream
+                                              args_list,
+                                              0));  // arguments
+                CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
            }
        }
    }