Commit e63322d9 authored by fenglei.tian

add runtime cuda kernel compile

parent ae010b42
./build/test/unit-test --gtest_filter=GPU.ab
./build/test/unit-test --gtest_filter=GPU.abc
./build/test/unit-test --gtest_filter=GPU.maximum
./build/test/unit-test --gtest_filter=GPU.minimum
./build/test/unit-test --gtest_filter=GPU.multiple*
./build/test/unit-test --gtest_filter=GPU.sqrt
./build/test/unit-test --gtest_filter=GPU.nagtive
./build/test/unit-test --gtest_filter=GPU.abs
#./build/test/unit-test --gtest_filter=GPU.ab
#./build/test/unit-test --gtest_filter=GPU.abc
#./build/test/unit-test --gtest_filter=GPU.maximum
#./build/test/unit-test --gtest_filter=GPU.minimum
#./build/test/unit-test --gtest_filter=GPU.multiple*
#./build/test/unit-test --gtest_filter=GPU.sqrt
#./build/test/unit-test --gtest_filter=GPU.nagtive
#./build/test/unit-test --gtest_filter=GPU.abs
#./build/test/unit-test --gtest_filter=GPU.dot*
@@ -271,7 +271,9 @@ endif()
# Nvidia
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
find_library(CUDA_nvrtc_LIBRARY nvrtc /usr/local/cuda/lib64)
find_library(CUDA_cuda_LIBRARY cuda /usr/local/cuda/lib64)
target_link_libraries(ngraph PUBLIC ${CUDA_cuda_LIBRARY} ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
endif()
# Argon
......
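Since the emitter now JIT-compiles kernels, ngraph links directly against libnvrtc and the CUDA driver library in addition to the runtime, cuBLAS, and cuDNN libraries it already used. A minimal sanity check that both newly linked libraries resolve and load correctly is sketched below; it is illustrative only and not part of this commit.

// Illustrative check (not part of this commit): confirms that libnvrtc and
// the CUDA driver library link and load correctly.
#include <cstdio>
#include <cuda.h>
#include <nvrtc.h>

int main()
{
    int nvrtc_major = 0, nvrtc_minor = 0, driver_version = 0;
    if (nvrtcVersion(&nvrtc_major, &nvrtc_minor) != NVRTC_SUCCESS ||
        cuDriverGetVersion(&driver_version) != CUDA_SUCCESS)
    {
        std::fprintf(stderr, "NVRTC or CUDA driver library not usable\n");
        return 1;
    }
    std::printf("NVRTC %d.%d, CUDA driver %d\n", nvrtc_major, nvrtc_minor, driver_version);
    return 0;
}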
@@ -20,6 +20,11 @@
#include <typeindex>
#include <unordered_map>
#include <vector>
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cudnn_v7.h>
#include "ngraph/node.hpp"
#include "ngraph/ops/broadcast.hpp"
@@ -44,6 +49,30 @@
using namespace std;
using namespace ngraph;
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< nvrtcGetErrorString(result) << '\n'; \
exit(1); \
} \
} while(0)
#define CUDA_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
exit(1); \
} \
} while(0)
void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
@@ -56,7 +85,104 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
const char *op_abs = R"(
extern "C" __global__
void cuda_op_abs(float* in, float* out, size_t n)
{
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
out[tid] = fabsf(in[tid]);
}
})";
size_t numBlocks = 4;
size_t numThreads = 4;
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog
op_abs, // buffer
"op_abs.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
const char *opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
2, // numOptions
opts); // options
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
exit(1);
}
size_t ptxSize;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = new char[ptxSize];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
// Destroy the program now that the PTX has been extracted.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
// Load the generated PTX and get a handle to the kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL( cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
// CUDA_SAFE_CALL(cuLinkCreate(0, 0 , 0, &linkState));
//CUDA_SAFE_CALL(cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY, ' ', 0, 0, 0));
//CUDA_SAFE_CALL(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "dynamic_parallelism.ptx", 0, 0, 0));
//size_t cubinSize;
//void *cubin;
//CUDA_SAFE_CALL(cuLinkComplete(linkState, &cubin, &cubinSize));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "void *argsList[] = {(void *)" << args[0].get_name() << ", (void *)" << out[0].get_name() << ", &count};\n";
writer << "//cuLaunchKernel(cuda_op_abs_kernel, count, 1, 1, 1, 1, 1, 0, NULL, argsList, 0);\n";
writer << "}\n";
// Generate input for execution, and create output buffers.
//size_t nt = numBlocks * numThreads;
//size_t bufferSize = nt * sizeof(float);
//float *hOut = new float[nt];
//float *hIn = new float[nt];
//for(int i = 0; i< nt; i++) hIn[i] = -i;
//
//CUdeviceptr dOut, dIn;
//cuMemAlloc(&dOut, bufferSize); // Execute parent kernel.
//cuMemAlloc(&dIn, bufferSize); // Execute parent kernel.
//cuMemcpyHtoD(dIn, hIn, bufferSize);
//
//void *argst[] = {&dIn, &dOut, &nt};
// CUDA_SAFE_CALL(
// cuLaunchKernel(kernel,
// numBlocks , 1, 1, // grid dim
// numThreads, 1, 1, // block dim
// 0, NULL, // shared mem and stream
// argst, 0)); // arguments
//CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
//cuMemcpyDtoH(hOut, dOut, bufferSize);
//for (size_t i = 0; i < nt; ++i) { std::cout << hOut[i] << '\n'; } // Release resources.
//cuMemFree(dOut);
//cuModuleUnload(module);
}
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
......
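For reference, the compile-and-launch path that EmitAbs only sketches (the cuLaunchKernel call in the emitted code is still commented out) can be exercised end to end with a small standalone program. The sketch below is illustrative only and not part of this commit: it compiles the same cuda_op_abs kernel with NVRTC, loads the PTX through the driver API, and runs it on a hypothetical 16-element buffer. Note that cuLaunchKernel takes an array of pointers to each kernel argument, so the launcher passes &d_in / &d_out / &n rather than the raw pointer values.

// Standalone sketch (not part of this commit): NVRTC compile + driver-API launch
// of the cuda_op_abs kernel shown above. Buffer size and launch shape are illustrative.
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda.h>
#include <nvrtc.h>

// Same error-checking idea as the NVRTC_SAFE_CALL / CUDA_SAFE_CALL macros above.
#define CHECK_NVRTC(x) { nvrtcResult r = (x); if (r != NVRTC_SUCCESS) { std::cerr << #x << ": " << nvrtcGetErrorString(r) << "\n"; std::exit(1); } }
#define CHECK_CU(x) { CUresult r = (x); if (r != CUDA_SUCCESS) { const char* m; cuGetErrorName(r, &m); std::cerr << #x << ": " << m << "\n"; std::exit(1); } }

static const char* op_abs_src = R"(
extern "C" __global__ void cuda_op_abs(float* in, float* out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { out[tid] = fabsf(in[tid]); }
})";

int main()
{
    // Compile CUDA C to PTX at runtime.
    nvrtcProgram prog;
    CHECK_NVRTC(nvrtcCreateProgram(&prog, op_abs_src, "op_abs.cu", 0, nullptr, nullptr));
    const char* opts[] = {"--gpu-architecture=compute_35"};
    CHECK_NVRTC(nvrtcCompileProgram(prog, 1, opts));
    size_t ptx_size;
    CHECK_NVRTC(nvrtcGetPTXSize(prog, &ptx_size));
    std::vector<char> ptx(ptx_size);
    CHECK_NVRTC(nvrtcGetPTX(prog, ptx.data()));
    CHECK_NVRTC(nvrtcDestroyProgram(&prog));

    // Load the PTX and look up the kernel through the driver API.
    CUdevice dev;
    CUcontext ctx;
    CUmodule mod;
    CUfunction kernel;
    CHECK_CU(cuInit(0));
    CHECK_CU(cuDeviceGet(&dev, 0));
    CHECK_CU(cuCtxCreate(&ctx, 0, dev));
    CHECK_CU(cuModuleLoadDataEx(&mod, ptx.data(), 0, nullptr, nullptr));
    CHECK_CU(cuModuleGetFunction(&kernel, mod, "cuda_op_abs"));

    // Illustrative data: 16 negative floats in, their absolute values out.
    size_t n = 16;
    std::vector<float> h_in(n), h_out(n);
    for (size_t i = 0; i < n; i++) h_in[i] = -static_cast<float>(i);
    CUdeviceptr d_in, d_out;
    CHECK_CU(cuMemAlloc(&d_in, n * sizeof(float)));
    CHECK_CU(cuMemAlloc(&d_out, n * sizeof(float)));
    CHECK_CU(cuMemcpyHtoD(d_in, h_in.data(), n * sizeof(float)));

    // cuLaunchKernel expects pointers to each argument value.
    void* args[] = {&d_in, &d_out, &n};
    unsigned block = 4;
    unsigned grid = static_cast<unsigned>((n + block - 1) / block);
    CHECK_CU(cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, nullptr, args, nullptr));
    CHECK_CU(cuCtxSynchronize());

    CHECK_CU(cuMemcpyDtoH(h_out.data(), d_out, n * sizeof(float)));
    for (size_t i = 0; i < n; i++) std::cout << h_out[i] << "\n";

    // Release resources.
    CHECK_CU(cuMemFree(d_in));
    CHECK_CU(cuMemFree(d_out));
    CHECK_CU(cuModuleUnload(mod));
    CHECK_CU(cuCtxDestroy(ctx));
    return 0;
}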
@@ -76,7 +76,7 @@ if(NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR)
endif()
if(NGRAPH_GPU_ENABLE AND LLVM_INCLUDE_DIR)
include_directories(SYSTEM ${LLVM_INCLUDE_DIR})
include_directories(SYSTEM ${LLVM_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
link_directories(${LLVM_LIB_DIR})
link_directories(${CUDA_LIBRARIES})
link_directories(${CUDA_CUBLAS_LIBRARIES})
@@ -130,7 +130,8 @@ if(LLVM_INCLUDE_DIR)
endif()
if(CUDA_INCLUDE_DIRS)
target_link_libraries(unit-test ${CUDA_LIBRARIES} ${CUDNN_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
find_library(CUDA_nvrtc_LIBRARY nvrtc /usr/local/cuda/lib64)
target_link_libraries(unit-test ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDNN_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
endif()
target_link_libraries(unit-test ngraph libgtest pthread)
......