Commit e332d92b authored by Fenglei Tian's avatar Fenglei Tian

add cuda kernel, function and context builder and manager

parent d68008ea
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once

#include <memory>
#include <string>

#include <cuda.h>

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            class Cuda_context_manager
            {
            public:
                static Cuda_context_manager& Instance()
                {
                    static Cuda_context_manager manager;
                    return manager;
                }

                Cuda_context_manager(Cuda_context_manager const&) = delete;
                Cuda_context_manager(Cuda_context_manager&&) = delete;
                Cuda_context_manager& operator=(Cuda_context_manager const&) = delete;
                Cuda_context_manager& operator=(Cuda_context_manager&&) = delete;

                std::shared_ptr<CUcontext> GetContext() { return context_ptr; }

            protected:
                Cuda_context_manager()
                {
                    // Initialize the driver API, grab device 0, and create a single
                    // context that is handed out to every caller through a shared_ptr.
                    CUDA_SAFE_CALL(cuInit(0));
                    CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
                    CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
                    context_ptr = std::make_shared<CUcontext>(context);
                }
                ~Cuda_context_manager() {}

                CUdevice cuDevice;
                CUcontext context;
                std::shared_ptr<CUcontext> context_ptr;
            };
        }
    }
}
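For context, a minimal usage sketch of the context manager (illustrative only, not part of the committed files); it assumes the CUDA driver call cuCtxSetCurrent and the project's CUDA_SAFE_CALL macro are available:

#include <memory>
#include <cuda.h>
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"

void use_shared_context()
{
    using ngraph::runtime::gpu::Cuda_context_manager;
    // Fetch the process-wide context created lazily by the singleton.
    std::shared_ptr<CUcontext> ctx = Cuda_context_manager::Instance().GetContext();
    // Bind it to the calling thread before issuing driver API calls.
    CUDA_SAFE_CALL(cuCtxSetCurrent(*ctx));
}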
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once

#include <memory>
#include <stdexcept>
#include <string>

#include <cuda.h>
#include <nvrtc.h>

#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            class Cuda_function_builder
            {
            public:
                static std::shared_ptr<CUfunction> get(std::string& kernel,
                                                       std::string& name,
                                                       int number_of_options,
                                                       const char** options)
                {
                    nvrtcProgram prog;
                    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
                                                       kernel.c_str(),
                                                       "op.cu",
                                                       0,      // numHeaders
                                                       NULL,   // headers
                                                       NULL)); // includeNames

                    nvrtcResult compile_result =
                        nvrtcCompileProgram(prog, number_of_options, options);
                    if (compile_result != NVRTC_SUCCESS)
                    {
                        std::string message = "compile error:\n" + kernel + "\noptions:";
                        for (int i = 0; i < number_of_options; i++)
                        {
                            message += std::string(" ") + options[i];
                        }
                        throw std::runtime_error(message);
                    }

                    size_t ptx_size;
                    NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
                    char* ptx = new char[ptx_size];
                    NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));     // Get the generated PTX.
                    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.

                    // Load the PTX and get a handle to the kernel entry point,
                    // which follows the cuda_op_<name> naming convention.
                    CUmodule module;
                    CUfunction function;
                    CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
                    CUDA_SAFE_CALL(
                        cuModuleGetFunction(&function, module, ("cuda_op_" + name).c_str()));
                    delete[] ptx;
                    return std::make_shared<CUfunction>(function);
                }
            };
        }
    }
}
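As an illustration (a hypothetical caller, not part of the committed files), the builder might be driven as follows; the kernel source sticks to the cuda_op_<name> naming convention assumed above, and the option list mirrors the one used in the emitters below:

#include <memory>
#include <string>

void build_noop_function()
{
    using namespace ngraph::runtime::gpu;
    std::string name = "noop";
    // Trivial kernel whose entry point matches the cuda_op_<name> convention.
    std::string kernel = R"(
extern "C" __global__ void cuda_op_noop(float* in, float* out, size_t n) {})";
    const char* opts[] = {"--gpu-architecture=compute_35",
                          "--relocatable-device-code=true"};
    std::shared_ptr<CUfunction> f = Cuda_function_builder::get(kernel, name, 2, opts);
    (void)f; // a real caller would cache the handle in Cuda_function_pool
}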
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include <cuda.h>

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            class Cuda_function_pool
            {
            public:
                static Cuda_function_pool& Instance()
                {
                    static Cuda_function_pool pool;
                    return pool;
                }

                Cuda_function_pool(Cuda_function_pool const&) = delete;
                Cuda_function_pool(Cuda_function_pool&&) = delete;
                Cuda_function_pool& operator=(Cuda_function_pool const&) = delete;
                Cuda_function_pool& operator=(Cuda_function_pool&&) = delete;

                void Set(std::string& name, std::shared_ptr<CUfunction> function)
                {
                    CUfunction_map.insert({name, function});
                }

                std::shared_ptr<CUfunction> Get(std::string& name)
                {
                    auto it = CUfunction_map.find(name);
                    if (it != CUfunction_map.end())
                    {
                        return (*it).second;
                    }
                    return nullptr;
                }

            protected:
                Cuda_function_pool() {}
                ~Cuda_function_pool() {}

                std::unordered_map<std::string, std::shared_ptr<CUfunction>> CUfunction_map;
            };
        }
    }
}
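A hedged sketch of the compile-once / reuse pattern the pool enables (hypothetical helper; it assumes exactly two NVRTC options, as in the emitters below):

#include <memory>
#include <string>

std::shared_ptr<CUfunction> get_or_build(std::string& name,
                                         std::string& kernel_source,
                                         const char** opts)
{
    using namespace ngraph::runtime::gpu;
    // Look the compiled function up by name; build and cache it on a miss.
    std::shared_ptr<CUfunction> f = Cuda_function_pool::Instance().Get(name);
    if (f == nullptr)
    {
        f = Cuda_function_builder::get(kernel_source, name, 2, opts);
        Cuda_function_pool::Instance().Set(name, f);
    }
    return f;
}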
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once

#include <string>
#include <vector>

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            class Cuda_kernel_builder
            {
            public:
                // Emit an element-wise kernel applying a unary function `op` to each element.
                static void get_1_element_op(std::string& name,
                                             std::string& data_type,
                                             std::string& op,
                                             std::string& kernel)
                {
                    kernel = R"(
extern "C" __global__
void cuda_op_)" + name + "(" + data_type + "* in, " + data_type + "* out, size_t n)\n" +
                             R"({
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = )" + op + R"((in[tid]);
    }
})";
                    return;
                }

                // Emit an element-wise kernel combining two inputs with a binary operator `op`.
                static void get_2_element_op(std::string& name,
                                             std::string& data_type,
                                             std::string& op,
                                             std::string& kernel)
                {
                    kernel = R"(
extern "C" __global__
void cuda_op_)" + name + "(" + data_type + "* in1, " + data_type + "* in2, " + data_type +
                             "* out, size_t n)\n" +
                             R"({
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = in1[tid] )" + op + R"( in2[tid];
    }
})";
                    return;
                }

                // Placeholder for fused n-ary element-wise kernels; not implemented yet.
                static void get_n_element_op(std::string& name,
                                             std::string& data_type,
                                             std::vector<std::string>& ops,
                                             std::string& kernel)
                {
                    kernel = "";
                    return;
                }
            };
        }
    }
}
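For reference, with name = "abs", data_type = "float", and op = "fabsf", get_1_element_op is intended to emit a kernel equivalent to the following (whitespace may differ):

// Generated element-wise kernel: one thread per element, guarded by n.
extern "C" __global__ void cuda_op_abs(float* in, float* out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = fabsf(in[tid]);
    }
}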
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <map>
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include <nvrtc.h>
#include <cuda.h>
@@ -22,47 +25,10 @@
#include <cublas_v2.h>
#include <cudnn_v7.h>
#include "ngraph/node.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/concatenate.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/function_call.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/replace_slice.hpp"
#include "ngraph/ops/reshape.hpp"
#include "ngraph/ops/reverse.hpp"
#include "ngraph/ops/slice.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/util.hpp"
#define NVRTC_SAFE_CALL(x)                                        \
    do {                                                          \
        nvrtcResult result = x;                                   \
        if (result != NVRTC_SUCCESS) {                            \
            std::cerr << "\nerror: " #x " failed with error "     \
                      << nvrtcGetErrorString(result) << '\n';     \
            exit(1);                                              \
        }                                                         \
    } while (0)
#define CUDA_SAFE_CALL(x)                                         \
    do {                                                          \
        CUresult result = x;                                      \
        if (result != CUDA_SUCCESS) {                             \
            const char* msg;                                      \
            cuGetErrorName(result, &msg);                         \
            std::cerr << "\nerror: " #x " failed with error "     \
                      << msg << '\n';                             \
            exit(1);                                              \
        }                                                         \
    } while (0)
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cude_kernel_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cude_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cude_function_pool.hpp"
namespace ngraph
{
@@ -77,69 +43,25 @@ namespace ngraph
            void emit_abs(void* in, void* out, size_t count)
            {
                std::string name = "abs";
                // Build and compile the kernel on first use, then cache the CUfunction
                // in the function pool so later calls skip NVRTC entirely.
                if (Cuda_function_pool::Instance().Get(name) == nullptr)
                {
                    const char* opts[] = {"--gpu-architecture=compute_35",
                                          "--relocatable-device-code=true"};
                    std::string kernel;
                    std::string data_type = "float";
                    std::string op = "fabsf";
                    Cuda_kernel_builder::get_1_element_op(name, data_type, op, kernel);
                    Cuda_function_pool::Instance().Set(
                        name, Cuda_function_builder::get(kernel, name, 2, opts));
                }

                // Convert runtime pointers to driver API pointers.
                CUdeviceptr dPtrIn = (CUdeviceptr)in;
                CUdeviceptr dPtrOut = (CUdeviceptr)out;
                void* argsList[] = {&dPtrIn, &dPtrOut, &count};
                CUDA_SAFE_CALL(
                    cuLaunchKernel(*Cuda_function_pool::Instance().Get(name),
                                   count, 1, 1, // grid dim
                                   1, 1, 1,     // block dim
                                   0, NULL,     // shared mem and stream
......
@@ -16,6 +16,28 @@
#pragma once

#include <cstdlib>
#include <iostream>

#include <cuda.h>
#include <nvrtc.h>

#define NVRTC_SAFE_CALL(x)                                        \
    do {                                                          \
        nvrtcResult result = x;                                   \
        if (result != NVRTC_SUCCESS) {                            \
            std::cerr << "\nerror: " #x " failed with error "     \
                      << nvrtcGetErrorString(result) << '\n';     \
            exit(1);                                              \
        }                                                         \
    } while (0)

#define CUDA_SAFE_CALL(x)                                         \
    do {                                                          \
        CUresult result = x;                                      \
        if (result != CUDA_SUCCESS) {                             \
            const char* msg;                                      \
            cuGetErrorName(result, &msg);                         \
            std::cerr << "\nerror: " #x " failed with error "     \
                      << msg << '\n';                             \
            exit(1);                                              \
        }                                                         \
    } while (0)
namespace ngraph
{
namespace runtime
......
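A brief usage sketch for the two safe-call macros (illustrative only): they evaluate the wrapped call once, print the failing expression and its error string on failure, and terminate the process.

#include <cuda.h>

void init_first_device()
{
    CUdevice device;
    CUDA_SAFE_CALL(cuInit(0));               // initialize the driver API
    CUDA_SAFE_CALL(cuDeviceGet(&device, 0)); // grab device 0 or exit with a message
}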