Commit 4eaf5711 authored by fenglei.tian

save the first working version; fix the conflict between the CUDA runtime and the driver API context

parent e14f4384
mkdir build
cd build
-cmake .. -DNGRAPH_GPU_ENABLE=TRUE -DNGRAPH_CPU_ENABLE=TRUE -DCUDNN_ROOT_DIR=/usr/lib/x86_64-linux-gnu/ -DCUDNN_INCLUDE_DIR=/usr/include -DZLIB_LIBRARY=/usr/lib/x86_64-linux/gpu/libz.so -DZLIB_INCLUDE_DIR=/usr/include/ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+cmake .. -DNGRAPH_GPU_ENABLE=TRUE -DNGRAPH_CPU_ENABLE=TRUE -DCUDNN_ROOT_DIR=/usr/lib/x86_64-linux-gnu/ -DCUDNN_INCLUDE_DIR=/usr/include -DZLIB_LIBRARY=/usr/lib/x86_64-linux/gpu/libz.so -DZLIB_INCLUDE_DIR=/usr/include/ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DNGRPH_COMPILER_DIAG_ENABLE=TRUE
make -j24 all
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -23,11 +23,41 @@
using namespace std;
using namespace ngraph;
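// Error-checking helpers for NVRTC and the CUDA driver API: on failure they
// print the failing call together with the NVRTC/driver error string and abort.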
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< nvrtcGetErrorString(result) << '\n'; \
exit(1); \
} \
} while(0)
#define CUDA_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
exit(1); \
} \
} while(0)
runtime::gpu::GPU_CallFrame::GPU_CallFrame(std::shared_ptr<GPU_ExternalFunction> external_function,
                                           EntryPoint compiled_function)
    : m_external_function(external_function)
    , m_compiled_function(compiled_function)
{
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
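// Creating and activating a driver-API context here means the cuBLAS/runtime
// calls below attach to this same context instead of a separate one, which is
// presumably the "conflict between cuda runtime and api context" that this
// commit fixes.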
cublasStatus_t cublasStatus = cublasCreate(&m_cublas_handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
...
@@ -88,7 +88,6 @@ namespace ngraph
}
})";
-size_t numBlocks = 4;
+size_t numThreads = 4;
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
@@ -125,32 +124,47 @@
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
// Load the generated PTX and get a handle to the parent kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
// CUDA_SAFE_CALL(cuLinkCreate(0, 0 , 0, &linkState));
//CUDA_SAFE_CALL(cuLinkeAddFile(linkState, CU_JIT_INPUT_LIBRARY, ' ', 0, 0, 0));
//CUDA_SAFE_CALL(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "dynamic_parallelism.ptx", 0, 0, 0));
//size_t cubinSize;
//void *cubin;
//CUDA_SAFE_CALL(cuLinkComplete(linkState, &cubin, &cubinSize));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
size_t numBlocks = 4;
size_t numThreads = 4;
size_t nt = numBlocks * numThreads;
size_t bufferSize = nt * sizeof(float);
float *hOut = new float[nt];
float *hIn = new float[nt];
for(int i = 0; i< nt; i++) hIn[i] = -i;
-void *argsList[] = {In, Out, &count};
+// void *dOut, *dIn;
// cudaMalloc((void**) &dIn, 64);
// cudaMalloc((void**) &dOut, 64);
CUdeviceptr dPtrIn, dPtrOut;
dPtrIn = (CUdeviceptr)in;
dPtrOut = (CUdeviceptr)out;
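// in and out are assumed to already be device pointers supplied by the GPU
// tensor views, so they are simply reinterpreted as CUdeviceptr and passed by
// address through argsList below.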
void *argsList[] = {&dPtrIn, &dPtrOut, &nt};
// cudaLaunchKernel(cuda_op_obs_kernel,
// {4, 1, 1},
// {1, 1, 1},
// argslist, 0, NULL);
// void *argsList[] = {dIn, dOut, &nt};
CUDA_SAFE_CALL(
cuLaunchKernel(cuda_op_abs_kernel,
-count, 1, 1, // grid dim
-1, 1, 1, // block dim
+4, 1, 1, // grid dim
+4, 1, 1, // block dim
0, NULL, // shared mem and stream
argsList, 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
const std::string& arg0, // replacement context
...
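For reference, the NVRTC-compile / PTX-load / cuLaunchKernel sequence stitched together above can be exercised as a small stand-alone program. The sketch below is illustrative only: the cuda_op_abs kernel body is a hypothetical reconstruction (the actual source string is elided in this diff), and the error-check macros are compact stand-ins for the NVRTC_SAFE_CALL/CUDA_SAFE_CALL definitions shown earlier.

#include <cuda.h>
#include <nvrtc.h>
#include <cstdlib>
#include <iostream>
#include <vector>

#define NVRTC_SAFE_CALL(x)                                                                                 \
    do { nvrtcResult r = x;                                                                                \
         if (r != NVRTC_SUCCESS) { std::cerr << #x << ": " << nvrtcGetErrorString(r) << "\n"; std::exit(1); } } while (0)
#define CUDA_SAFE_CALL(x)                                                                                  \
    do { CUresult r = x;                                                                                   \
         if (r != CUDA_SUCCESS) { const char* m; cuGetErrorName(r, &m); std::cerr << #x << ": " << m << "\n"; std::exit(1); } } while (0)

// Hypothetical kernel source; the real string compiled above is not shown in this diff.
static const char* kernel_src = R"(
extern "C" __global__ void cuda_op_abs(float* in, float* out, size_t n)
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = fabsf(in[i]);
})";

int main()
{
    // Compile the kernel source to PTX with NVRTC.
    nvrtcProgram prog;
    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, kernel_src, "op_abs.cu", 0, nullptr, nullptr));
    NVRTC_SAFE_CALL(nvrtcCompileProgram(prog, 0, nullptr));
    size_t ptx_size;
    NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
    std::vector<char> ptx(ptx_size);
    NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx.data()));
    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));

    // Initialize the driver API, load the PTX, and look up the kernel handle.
    CUdevice dev;
    CUcontext ctx;
    CUmodule mod;
    CUfunction fn;
    CUDA_SAFE_CALL(cuInit(0));
    CUDA_SAFE_CALL(cuDeviceGet(&dev, 0));
    CUDA_SAFE_CALL(cuCtxCreate(&ctx, 0, dev));
    CUDA_SAFE_CALL(cuModuleLoadDataEx(&mod, ptx.data(), 0, nullptr, nullptr));
    CUDA_SAFE_CALL(cuModuleGetFunction(&fn, mod, "cuda_op_abs"));

    // Allocate device buffers, copy the input, launch 4 blocks of 4 threads, and read back.
    size_t n = 16;
    std::vector<float> h_in(n), h_out(n);
    for (size_t i = 0; i < n; i++) h_in[i] = -static_cast<float>(i);
    CUdeviceptr d_in, d_out;
    CUDA_SAFE_CALL(cuMemAlloc(&d_in, n * sizeof(float)));
    CUDA_SAFE_CALL(cuMemAlloc(&d_out, n * sizeof(float)));
    CUDA_SAFE_CALL(cuMemcpyHtoD(d_in, h_in.data(), n * sizeof(float)));
    void* args[] = {&d_in, &d_out, &n};
    CUDA_SAFE_CALL(cuLaunchKernel(fn, 4, 1, 1, 4, 1, 1, 0, nullptr, args, nullptr));
    CUDA_SAFE_CALL(cuCtxSynchronize());
    CUDA_SAFE_CALL(cuMemcpyDtoH(h_out.data(), d_out, n * sizeof(float)));
    for (size_t i = 0; i < n; i++) std::cout << h_out[i] << "\n";

    // Release device resources and tear down the context.
    CUDA_SAFE_CALL(cuMemFree(d_in));
    CUDA_SAFE_CALL(cuMemFree(d_out));
    CUDA_SAFE_CALL(cuModuleUnload(mod));
    CUDA_SAFE_CALL(cuCtxDestroy(ctx));
    return 0;
}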
@@ -27,7 +27,7 @@
{
namespace kernel
{
-void emit_abs(void** in, void** out, size_t count);
+void emit_abs(void* in, void* out, size_t count);
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
...
@@ -44,6 +44,7 @@
#include "ngraph/ops/sum.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
using namespace std; using namespace std;
@@ -89,22 +90,26 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
-writer << "ngraph::runtime::gpu::cuda::kernel::emit_abs(" << args[0].get_name() << ", " << out[0].get_name() << ", count);\n";
+writer << "ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) " << args[0].get_name() << ", (void*) " << out[0].get_name() << ", count);\n";
writer.indent--;
writer << "}\n";
-// Generate input for execution, and create output buffers.
-//size_t nt = numBlocks * numThreads;
-//size_t bufferSize = nt * sizeof(float);
-//float *hOut = new float[nt];
-//float *hIn = new float[nt];
-//for(int i = 0; i< nt; i++) hIn[i] = -i;
-//
-//CUdeviceptr dOut, dIn;
-//cuMemAlloc(&dOut, bufferSize); // Execute parent kernel.
-//cuMemAlloc(&dIn, bufferSize); // Execute parent kernel.
-//cuMemcpyHtoD(dIn, hIn, bufferSize);
-//
+// ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) , (void*) ((float*)(outputs[0])), count);
+//Generate input for execution, and create output buffers.
+// size_t nt = 4; //numBlocks * numThreads;
+// size_t bufferSize = nt * sizeof(float);
+// float *hOut = new float[nt];
+// float *hIn = new float[nt];
+// for(int i = 0; i< nt; i++) hIn[i] = -i;
+//
+// CUdeviceptr dOut, dIn;
+// cuMemAlloc(&dOut, bufferSize); // Execute parent kernel.
+// cuMemAlloc(&dIn, bufferSize); // Execute parent kernel.
+// cuMemcpyHtoD(dIn, hIn, bufferSize);
+//
+// ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) dIn , (void*) dOut, nt);
//void *argst[] = {&dIn, &dOut, &nt};
// CUDA_SAFE_CALL(
// cuLaunchKernel(kernel,
...
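For clarity, the code that EmitAbs generates above should end up looking roughly like the snippet below. The names arg0 and out0 are placeholders for whatever args[0].get_name() and out[0].get_name() return, and 16 is just an example value for out[0].get_size().

// Hypothetical emitter output for an Abs node (placeholder names and size).
{
    int count = 16;
    if(count == 0) return;
    ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) arg0, (void*) out0, count);
}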
@@ -96,6 +96,7 @@
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
using namespace std;
using namespace ngraph;
@@ -254,10 +255,10 @@ void runtime::gpu::GPU_ExternalFunction::compile()
string pch_header_source = writer.get_code();
-writer += R"(
-using namespace ngraph;
-using namespace std;
-)";
+// writer += R"(
+// using namespace ngraph;
+// using namespace std;
+//)";
if (m_emit_timing)
{
...