ngraph

Commit 4eaf5711 authored Feb 12, 2018 by fenglei.tian

save the first working version, fix conflict between cuda runtime and api context
parent e14f4384

Showing 6 changed files with 88 additions and 38 deletions
scripts/build.sh                                     +1  -1
src/ngraph/runtime/gpu/gpu_call_frame.cpp            +31 -1
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp  +32 -18
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp  +1  -1
src/ngraph/runtime/gpu/gpu_emitter.cpp               +18 -13
src/ngraph/runtime/gpu/gpu_external_function.cpp     +5  -4
scripts/build.sh

mkdir build
cd build
cmake .. -DNGRAPH_GPU_ENABLE=TRUE -DNGRAPH_CPU_ENABLE=TRUE -DCUDNN_ROOT_DIR=/usr/lib/x86_64-linux-gnu/ -DCUDNN_INCLUDE_DIR=/usr/include -DZLIB_LIBRARY=/usr/lib/x86_64-linux/gpu/libz.so -DZLIB_INCLUDE_DIR=/usr/include/ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
cmake .. -DNGRAPH_GPU_ENABLE=TRUE -DNGRAPH_CPU_ENABLE=TRUE -DCUDNN_ROOT_DIR=/usr/lib/x86_64-linux-gnu/ -DCUDNN_INCLUDE_DIR=/usr/include -DZLIB_LIBRARY=/usr/lib/x86_64-linux/gpu/libz.so -DZLIB_INCLUDE_DIR=/usr/include/ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DNGRPH_COMPILER_DIAG_ENABLE=TRUE
make -j24 all
src/ngraph/runtime/gpu/gpu_call_frame.cpp

// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
...

@@ -23,11 +23,41 @@
using namespace std;
using namespace ngraph;

#define NVRTC_SAFE_CALL(x)                                        \
    do {                                                          \
        nvrtcResult result = x;                                   \
        if (result != NVRTC_SUCCESS) {                            \
            std::cerr << "\nerror: " #x " failed with error "     \
                      << nvrtcGetErrorString(result) << '\n';     \
            exit(1);                                              \
        }                                                         \
    } while(0)

#define CUDA_SAFE_CALL(x)                                         \
    do {                                                          \
        CUresult result = x;                                      \
        if (result != CUDA_SUCCESS) {                             \
            const char *msg;                                      \
            cuGetErrorName(result, &msg);                         \
            std::cerr << "\nerror: " #x " failed with error "     \
                      << msg << '\n';                             \
            exit(1);                                              \
        }                                                         \
    } while(0)

runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
                                           EntryPoint compiled_function)
    : m_external_function(external_function)
    , m_compiled_function(compiled_function)
{
    CUdevice cuDevice;
    CUcontext context;
    CUmodule module;
    CUfunction cuda_op_abs_kernel;
    CUDA_SAFE_CALL(cuInit(0));
    CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
    CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));

    cublasStatus_t cublasStatus = cublasCreate(&m_cublas_handle);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
...
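For reference only, not part of the commit: the commit message mentions a conflict between the CUDA runtime and the API context, and the call frame above creates its own driver-API context with cuCtxCreate while cuBLAS (via the runtime API) implicitly uses the device's primary context. One common way to avoid that kind of mismatch is to retain the primary context through the driver API instead of creating a second one; the sketch below illustrates that pattern under those assumptions and is not what this commit does.

// Hypothetical sketch: bind the driver API to the primary context so driver-API
// module loads and runtime-API/cuBLAS calls operate in the same context.
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <cstdlib>

#define CHECK_CU(x)                                               \
    do {                                                          \
        CUresult r = (x);                                         \
        if (r != CUDA_SUCCESS) {                                  \
            std::fprintf(stderr, "%s failed\n", #x);              \
            std::exit(1);                                         \
        }                                                         \
    } while (0)

int main()
{
    CUdevice dev;
    CUcontext primary;
    CHECK_CU(cuInit(0));
    CHECK_CU(cuDeviceGet(&dev, 0));
    // Retain the primary context (the one the runtime API uses) and make it
    // current, instead of calling cuCtxCreate and ending up with two contexts.
    CHECK_CU(cuDevicePrimaryCtxRetain(&primary, dev));
    CHECK_CU(cuCtxSetCurrent(primary));

    // cuBLAS (runtime API) and driver-API kernel launches now share one context.
    cublasHandle_t handle;
    cublasCreate(&handle);
    // ... cuModuleLoadDataEx / cuLaunchKernel could run here as well ...
    cublasDestroy(handle);
    CHECK_CU(cuDevicePrimaryCtxRelease(dev));
    return 0;
}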
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.cpp

...
@@ -88,7 +88,6 @@ namespace ngraph
    }
})";

size_t numBlocks = 4;
size_t numThreads = 4;

// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
...

@@ -125,32 +124,47 @@ namespace ngraph
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));

// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));

// Load the generated PTX and get a handle to the parent kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));

// CUDA_SAFE_CALL(cuLinkCreate(0, 0 , 0, &linkState));
//CUDA_SAFE_CALL(cuLinkeAddFile(linkState, CU_JIT_INPUT_LIBRARY, ' ', 0, 0, 0));
//CUDA_SAFE_CALL(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "dynamic_parallelism.ptx", 0, 0, 0));
//size_t cubinSize;
//void *cubin;
//CUDA_SAFE_CALL(cuLinkComplete(linkState, &cubin, &cubinSize));

CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));

size_t numBlocks = 4;
size_t numThreads = 4;
size_t nt = numBlocks * numThreads;
size_t bufferSize = nt * sizeof(float);
float* hOut = new float[nt];
float* hIn = new float[nt];
for (int i = 0; i < nt; i++) hIn[i] = -i;

void* argsList[] = {In, Out, &count};

// void *dOut, *dIn;
// cudaMalloc((void**) &dIn, 64);
// cudaMalloc((void**) &dOut, 64);

CUdeviceptr dPtrIn, dPtrOut;
dPtrIn = (CUdeviceptr)in;
dPtrOut = (CUdeviceptr)out;
void* argsList[] = {&dPtrIn, &dPtrOut, &nt};

// cudaLaunchKernel(cuda_op_obs_kernel,
//                  {4, 1, 1},
//                  {1, 1, 1},
//                  argslist, 0, NULL);
// void *argsList[] = {dIn, dOut, &nt};

CUDA_SAFE_CALL(cuLaunchKernel(cuda_op_abs_kernel,
                              count, 1, 1,   // grid dim
                              1, 1, 1,       // block dim
                              4, 1, 1,       // grid dim
                              4, 1, 1,       // block dim
                              0, NULL,       // shared mem and stream
                              argsList, 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize());
// Retrieve and print output.
}

void emit_broadcast(codegen::CodeWriter& writer,
                    const std::string& element_type,
                    const std::string& arg0, // replacement context
...
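For reference only, not part of the commit: the file above follows the standard NVRTC flow of compiling a CUDA C string to PTX, loading the PTX through the driver API with cuModuleLoadDataEx, and launching the kernel with cuLaunchKernel. The sketch below is a minimal, self-contained illustration of that flow using the same assumptions as the diff (a trivial cuda_op_abs kernel, 4 blocks of 4 threads); kSource and the buffer names and sizes are illustrative, not the nGraph implementation.

// Minimal NVRTC compile-and-launch sketch (illustrative only).
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

#define NVRTC_CHECK(x)                                                        \
    do {                                                                      \
        nvrtcResult r = (x);                                                  \
        if (r != NVRTC_SUCCESS) {                                             \
            std::fprintf(stderr, "%s: %s\n", #x, nvrtcGetErrorString(r));     \
            std::exit(1);                                                     \
        }                                                                     \
    } while (0)

#define CU_CHECK(x)                                                           \
    do {                                                                      \
        CUresult r = (x);                                                     \
        if (r != CUDA_SUCCESS) {                                              \
            const char* m;                                                    \
            cuGetErrorName(r, &m);                                            \
            std::fprintf(stderr, "%s: %s\n", #x, m);                          \
            std::exit(1);                                                     \
        }                                                                     \
    } while (0)

static const char* kSource = R"(
extern "C" __global__ void cuda_op_abs(float* in, float* out, size_t n)
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = fabsf(in[i]);
})";

int main()
{
    // 1. Compile the CUDA C string to PTX with NVRTC.
    nvrtcProgram prog;
    NVRTC_CHECK(nvrtcCreateProgram(&prog, kSource, "op_abs.cu", 0, nullptr, nullptr));
    NVRTC_CHECK(nvrtcCompileProgram(prog, 0, nullptr));
    size_t ptxSize;
    NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptxSize));
    std::string ptx(ptxSize, '\0');
    NVRTC_CHECK(nvrtcGetPTX(prog, &ptx[0]));
    NVRTC_CHECK(nvrtcDestroyProgram(&prog));

    // 2. Load the PTX through the driver API and look up the kernel by name.
    CUdevice dev;
    CUcontext ctx;
    CUmodule mod;
    CUfunction kernel;
    CU_CHECK(cuInit(0));
    CU_CHECK(cuDeviceGet(&dev, 0));
    CU_CHECK(cuCtxCreate(&ctx, 0, dev));
    CU_CHECK(cuModuleLoadDataEx(&mod, ptx.c_str(), 0, nullptr, nullptr));
    CU_CHECK(cuModuleGetFunction(&kernel, mod, "cuda_op_abs"));

    // 3. Prepare device buffers and launch 4 blocks of 4 threads, as in the diff.
    size_t nt = 4 * 4;
    std::vector<float> hIn(nt), hOut(nt);
    for (size_t i = 0; i < nt; i++) hIn[i] = -float(i);
    CUdeviceptr dIn, dOut;
    CU_CHECK(cuMemAlloc(&dIn, nt * sizeof(float)));
    CU_CHECK(cuMemAlloc(&dOut, nt * sizeof(float)));
    CU_CHECK(cuMemcpyHtoD(dIn, hIn.data(), nt * sizeof(float)));
    void* args[] = {&dIn, &dOut, &nt};
    CU_CHECK(cuLaunchKernel(kernel,
                            4, 1, 1, // grid dim
                            4, 1, 1, // block dim
                            0, nullptr, args, nullptr));
    CU_CHECK(cuCtxSynchronize());
    CU_CHECK(cuMemcpyDtoH(hOut.data(), dOut, nt * sizeof(float)));
    std::printf("out[5] = %f\n", hOut[5]);

    CU_CHECK(cuMemFree(dIn));
    CU_CHECK(cuMemFree(dOut));
    CU_CHECK(cuModuleUnload(mod));
    CU_CHECK(cuCtxDestroy(ctx));
    return 0;
}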
src/ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp

...
@@ -27,7 +27,7 @@ namespace ngraph
{
    namespace kernel
    {
        void emit_abs(void** in, void** out, size_t count);
        void emit_abs(void* in, void* out, size_t count);
        void emit_broadcast(codegen::CodeWriter& writer,
                            const std::string& element_type,
...
src/ngraph/runtime/gpu/gpu_emitter.cpp

...
@@ -44,6 +44,7 @@
#include "ngraph/ops/sum.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/util.hpp"

using namespace std;
...

@@ -89,22 +90,26 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::cuda::kernel::emit_abs(" << args[0].get_name() << ", " << out[0].get_name() << ", count);\n";
writer << "ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) " << args[0].get_name() << ", (void*) " << out[0].get_name() << ", count);\n";
writer.indent--;
writer << "}\n";

// Generate input for execution, and create output buffers.
//size_t nt = numBlocks * numThreads;
//size_t bufferSize = nt * sizeof(float);
//float *hOut = new float[nt];
//float *hIn = new float[nt];
//for(int i = 0; i< nt; i++) hIn[i] = -i;
//
//CUdeviceptr dOut, dIn;
//cuMemAlloc(&dOut, bufferSize); // Execute parent kernel.
//cuMemAlloc(&dIn, bufferSize); // Execute parent kernel.
//cuMemcpyHtoD(dIn, hIn, bufferSize);
//
// ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) , (void*) ((float*)(outputs[0])), count);

//Generate input for execution, and create output buffers.
// size_t nt = 4; //numBlocks * numThreads;
// size_t bufferSize = nt * sizeof(float);
// float *hOut = new float[nt];
// float *hIn = new float[nt];
// for(int i = 0; i< nt; i++) hIn[i] = -i;
//
// CUdeviceptr dOut, dIn;
// cuMemAlloc(&dOut, bufferSize); // Execute parent kernel.
// cuMemAlloc(&dIn, bufferSize); // Execute parent kernel.
// cuMemcpyHtoD(dIn, hIn, bufferSize);
//
// ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) dIn , (void*) dOut, nt);
//void *argst[] = {&dIn, &dOut, &nt};
// CUDA_SAFE_CALL(
// cuLaunchKernel(kernel,
...
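For reference only, not part of the commit: EmitAbs above does not run the kernel itself; it writes C++ source into the code writer, and that generated source calls emit_abs at runtime. For a hypothetical input tensor named arg0 with 16 elements and an output named out0 (both names are illustrative placeholders), the emitted snippet would look roughly like this:

// Approximate shape of the code generated by EmitAbs for one Abs node.
{
    int count = 16;
    if (count == 0) return;
    ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) arg0, (void*) out0, count);
}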
src/ngraph/runtime/gpu/gpu_external_function.cpp

...
@@ -96,6 +96,7 @@
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"

using namespace std;
using namespace ngraph;
...

@@ -254,10 +255,10 @@ void runtime::gpu::GPU_ExternalFunction::compile()
string pch_header_source = writer.get_code();

writer += R"(
using namespace ngraph;
using namespace std;
)";

// writer += R"(
// using namespace ngraph;
// using namespace std;
// )";

if (m_emit_timing)
{
...