Commit 400f7f2c authored by fenglei.tian

clean up code

parent 4eaf5711
@@ -77,7 +77,7 @@ namespace ngraph
void emit_abs(void* in, void* out, size_t count)
{
const char *op_abs = R"(
const char *op_abs = R"(
extern "C" __global__
void cuda_op_abs(float* in, float* out, size_t n)
{
@@ -88,80 +88,62 @@ namespace ngraph
}
})";
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog i
op_abs, // buffer
"op_abs.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
const char *opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
2, // numOptions
opts); // options
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
exit(1);
}
size_t ptxSize;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = new char[ptxSize];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx)); // Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Load the generated PTX and get a handle to the parent kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL( cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
size_t numBlocks = 4;
size_t numThreads = 4;
size_t nt = numBlocks * numThreads;
size_t bufferSize = nt * sizeof(float);
float *hOut = new float[nt];
float *hIn = new float[nt];
for(int i = 0; i< nt; i++) hIn[i] = -i;
// void *dOut, *dIn;
// cudaMalloc((void**) &dIn, 64);
// cudaMalloc((void**) &dOut, 64);
CUdeviceptr dPtrIn, dPtrOut;
dPtrIn = (CUdeviceptr)in;
dPtrOut = (CUdeviceptr)out;
void *argsList[] = {&dPtrIn, &dPtrOut, &nt};
// cudaLaunchKernel(cuda_op_obs_kernel,
// {4, 1, 1},
// {1, 1, 1},
// argslist, 0, NULL);
// void *argsList[] = {dIn, dOut, &nt};
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
op_abs,
"op_abs.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
const char *opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog,
2,
opts);
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
exit(1);
}
size_t ptxSize;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = new char[ptxSize];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx)); // Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Load the generated PTX and get a handle to the parent kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL( cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
CUdeviceptr dPtrIn, dPtrOut;
dPtrIn = (CUdeviceptr)in;
dPtrOut = (CUdeviceptr)out;
void *argsList[] = {&dPtrIn, &dPtrOut, &count};
CUDA_SAFE_CALL(
cuLaunchKernel(cuda_op_abs_kernel,
4 , 1, 1, // grid dim
4, 1, 1, // block dim
0, NULL, // shared mem and stream
argsList, 0)); // arguments
cuLaunchKernel(cuda_op_abs_kernel,
count ,1, 1, // grid dim
1, 1, 1, // block dim
0, NULL, // shared mem and stream
argsList, 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
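The cleaned-up code above launches the kernel with count blocks of one thread each. A more conventional configuration fixes the block size and derives the grid size from the element count; the sketch below is illustrative only and is not part of this commit: the 256-thread block size and the helper name launch_op_abs are assumptions, it reuses the CUDA_SAFE_CALL helper from the surrounding code, and it assumes the kernel indexes elements as blockIdx.x * blockDim.x + threadIdx.x with a bounds check against n.

#include <cuda.h>

// Illustrative sketch (not part of this commit): launch the compiled kernel
// with a fixed block size and a grid sized from the element count.
void launch_op_abs(CUfunction kernel, CUdeviceptr d_in, CUdeviceptr d_out, size_t count)
{
    unsigned int block_size = 256; // assumed block size
    unsigned int grid_size =
        static_cast<unsigned int>((count + block_size - 1) / block_size);
    void* args[] = {&d_in, &d_out, &count};
    CUDA_SAFE_CALL(cuLaunchKernel(kernel,
                                  grid_size, 1, 1,  // grid dim
                                  block_size, 1, 1, // block dim
                                  0, NULL,          // shared mem and stream
                                  args, 0));        // arguments
    CUDA_SAFE_CALL(cuCtxSynchronize()); // wait for the kernel to finish
}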
@@ -93,35 +93,6 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
writer << "ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) " << args[0].get_name() << ", (void*) " << out[0].get_name() << ", count);\n";
writer.indent--;
writer << "}\n";
// ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) , (void*) ((float*)(outputs[0])), count);
//Generate input for execution, and create output buffers.
// size_t nt = 4; //numBlocks * numThreads;
// size_t bufferSize = nt * sizeof(float);
// float *hOut = new float[nt];
// float *hIn = new float[nt];
// for(int i = 0; i< nt; i++) hIn[i] = -i;
//
// CUdeviceptr dOut, dIn;
// cuMemAlloc(&dOut, bufferSize); // Execute parent kernel.
// cuMemAlloc(&dIn, bufferSize); // Execute parent kernel.
// cuMemcpyHtoD(dIn, hIn, bufferSize);
//
// ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) dIn , (void*) dOut, nt);
//void *argst[] = {&dIn, &dOut, &nt};
// CUDA_SAFE_CALL(
// cuLaunchKernel(kernel,
// numBlocks , 1, 1, // grid dim
// numThreads, 1, 1, // block dim
// 0, NULL, // shared mem and stream
// argst, 0)); // arguments
//CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
//cuMemcpyDtoH(hOut, dOut, bufferSize);
//for (size_t i = 0; i < nt; ++i) { std::cout << hOut[i] << '\n'; } // Release resources.
//cuMemFree(dOut);
//cuModuleUnload(module);
}
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
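Both hunks rely on the NVRTC_SAFE_CALL and CUDA_SAFE_CALL error-check helpers, which are not shown in this diff. A minimal sketch of typical definitions, along the lines of the NVRTC sample code, follows; the repository's actual macros may differ.

#include <cuda.h>
#include <nvrtc.h>
#include <cstdlib>
#include <iostream>

// Typical NVRTC/CUDA driver API error-check helpers (sketch only).
#define NVRTC_SAFE_CALL(x)                                                   \
    do {                                                                     \
        nvrtcResult result = x;                                              \
        if (result != NVRTC_SUCCESS) {                                       \
            std::cerr << "error: " #x " failed with "                        \
                      << nvrtcGetErrorString(result) << '\n';                \
            std::exit(1);                                                    \
        }                                                                    \
    } while (0)

#define CUDA_SAFE_CALL(x)                                                    \
    do {                                                                     \
        CUresult result = x;                                                 \
        if (result != CUDA_SUCCESS) {                                        \
            const char* msg;                                                 \
            cuGetErrorName(result, &msg);                                    \
            std::cerr << "error: " #x " failed with " << msg << '\n';        \
            std::exit(1);                                                    \
        }                                                                    \
    } while (0)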